In [1]:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset,DataLoader,TensorDataset

In [2]:
class ScaledDotProductAttention(nn.Module):
    """ Scaled Dot-Product Attention """

    def __init__(self, scale):
        super().__init__()

        self.scale = scale
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v, mask=None):
        u = torch.bmm(q, k.transpose(1, 2)) # 1.Matmul
        u = u / self.scale # 2.Scale

        if mask is not None:
            u = u.masked_fill(mask, -np.inf) # 3.Mask

        attn = self.softmax(u) # 4.Softmax
        output = torch.bmm(attn, v) # 5.Output

        return attn, output

In [24]:
if __name__ == "__main__":
    n_q, n_k, n_v = 2, 4, 4
    d_q, d_k, d_v = 128, 128, 64
    batch = 1

    q = torch.randn(batch, n_q, d_q)
    k = torch.randn(batch, n_k, d_k)
    v = torch.randn(batch, n_v, d_v)
    mask = torch.zeros(batch, n_q, n_k).bool()

    attention = ScaledDotProductAttention(scale=np.power(d_k, 0.5))
    print(attention)
    attn, output = attention(q, k, v, mask=mask)

    print(attn.shape)
#     print(output)

ScaledDotProductAttention(
  (softmax): Softmax(dim=2)
)
torch.Size([1, 2, 4])


In [3]:
class MultiHeadAttention(nn.Module):
    """ Multi-Head Attention """

    def __init__(self, n_head, d_k_, d_v_, d_k, d_v, d_o):
        super().__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.fc_q = nn.Linear(d_k_, n_head * d_k)
        self.fc_k = nn.Linear(d_k_, n_head * d_k)
        self.fc_v = nn.Linear(d_v_, n_head * d_v)

        self.attention = ScaledDotProductAttention(scale=np.power(d_k, 0.5))

        self.fc_o = nn.Linear(n_head * d_v, d_o)

    def forward(self, q, k, v, mask=None):

        n_head, d_q, d_k, d_v = self.n_head, self.d_k, self.d_k, self.d_v

        batch, n_q, d_q_ = q.size()
        batch, n_k, d_k_ = k.size()
        batch, n_v, d_v_ = v.size()

        q = self.fc_q(q) # 1.单头变多头
        k = self.fc_k(k)
        v = self.fc_v(v)
        q = q.view(batch, n_q, n_head, d_q).permute(2, 0, 1, 3).contiguous().view(-1, n_q, d_q)
        k = k.view(batch, n_k, n_head, d_k).permute(2, 0, 1, 3).contiguous().view(-1, n_k, d_k)
        v = v.view(batch, n_v, n_head, d_v).permute(2, 0, 1, 3).contiguous().view(-1, n_v, d_v)

        if mask is not None:
            mask = mask.repeat(n_head, 1, 1)
        attn, output = self.attention(q, k, v, mask=mask) # 2.当成单头注意力求输出

        output = output.view(n_head, batch, n_q, d_v).permute(1, 2, 0, 3).contiguous().view(batch, n_q, -1) # 3.Concat
        output = self.fc_o(output) # 4.仿射变换得到最终输出

        return attn, output

In [10]:
if __name__ == "__main__":
    n_q, n_k, n_v = 2, 4, 4
    d_q_, d_k_, d_v_ = 128, 128, 64

    q = torch.randn(batch, n_q, d_q_)
    k = torch.randn(batch, n_k, d_k_)
    v = torch.randn(batch, n_v, d_v_)    
    mask = torch.zeros(batch, n_q, n_k).bool()

    mha = MultiHeadAttention(n_head=8, d_k_=128, d_v_=64, d_k=256, d_v=128, d_o=128)
    attn, output = mha(q, k, v, mask=mask)

    print(attn.size())
    print(output.size())

torch.Size([8, 2, 4])
torch.Size([1, 2, 128])


In [4]:
class SelfAttention(nn.Module):
    """ Self-Attention """

    def __init__(self, n_head, d_k, d_v, d_x, d_o,l):
        super(SelfAttention,self).__init__()
        self.wq = nn.Parameter(torch.Tensor(d_x, d_k))
        self.wk = nn.Parameter(torch.Tensor(d_x, d_k))
        self.wv = nn.Parameter(torch.Tensor(d_x, d_v))

        self.mha = MultiHeadAttention(n_head=n_head, d_k_=d_k, d_v_=d_v, d_k=d_k, d_v=d_v, d_o=d_o)
        self.rnn = nn.LSTM(d_o,l)
        self.out = nn.Linear(l,1)

        self.init_parameters()

    def init_parameters(self):
        for param in self.parameters():
            stdv = 1. / np.power(param.size(-1), 0.5)
            param.data.uniform_(-stdv, stdv)

    def forward(self, x, mask=None):
        q = torch.matmul(x, self.wq)   
        k = torch.matmul(x, self.wk)
        v = torch.matmul(x, self.wv)

        attn, output = self.mha(q, k, v, mask=mask)
        output,(hn,cn) = self.rnn(output)
        output = self.out(output[:,:,-1])
        
        

        return attn, output


In [34]:
rnn = nn.LSTM(10, 20, 2)
input = torch.randn(5, 3, 10)
h0 = torch.randn(2, 3, 20)
c0 = torch.randn(2, 3, 20)
output, (hn, cn) = rnn(input, (h0, c0))
hn_ = hn.view(-1,20,2)
output.shape
# output[:,:,-1].shape

torch.Size([5, 3, 20])

In [5]:
import h5py
temp=h5py.File(r"D:/downloads/20150105.hdf5","r")
data = temp["vol"][:,:,:,0] + temp['vol'][:,:,:,1]

In [11]:
data.shape

(2309, 31, 601)

In [32]:
y = temp['labels']
type(y)

h5py._hl.dataset.Dataset

In [7]:
x_train = data[:2000]
x_test = data[2000:]
y_train = y[:2000]
y_test = y[2000:]
x_train = torch.from_numpy(x_train).float()
x_test = torch.from_numpy(x_test).float()
y_train = torch.from_numpy(y_train).float()
y_test = torch.from_numpy(y_test).float()
train_dataset = TensorDataset(x_train,y_train)
test_dataset = TensorDataset(x_test,y_test)

train_loader = DataLoader(dataset=train_dataset,batch_size=16,shuffle=True,drop_last=True)
test_loader = DataLoader(dataset=test_dataset,batch_size=16,shuffle=True,drop_last=True)

In [29]:
if __name__ == "__main__":
    n_x = 31
    d_x = 601
    batch = 2309

    x = torch.randn(batch, n_x, d_x)
#     x = torch.from_numpy(data).float()
#     mask = torch.zeros(batch, n_x, n_x).bool()
    mask = None

    selfattn = SelfAttention(n_head=1, d_k=128, d_v=601, d_x=601, d_o=80,l=31)
    attn, output = selfattn(x, mask=mask)

    print(attn.size())
    print(output.size())

torch.Size([2309, 31, 31])
torch.Size([2309, 1])


In [22]:
if __name__ == '__main__':
    learning_rate = 0.001
    epoch_num = 100
    device = torch.device("cuda:0") if torch.cuda.is_available() else "cpu"
    
    train_loss_list = []
    val_loss_list = []
    
    n_x = 31
    d_x = 601
    batch = 16
    
    mask = None
    
    model = SelfAttention(n_head=8, d_k=128, d_v=64, d_x=601, d_o=80,l=31)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
#     optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
#     optimizer = torch.optim.RMSprop(model.parameters(),lr=learning_rate,alpha=0.99,eps=1e-08, weight_decay=0, momentum=0, centered=False)
    loss_fn = torch.nn.MSELoss(reduction='mean')

    for i in range(epoch_num):
        model.train()
        train_loss = 0
        for j,(x,y) in enumerate(train_loader):
            attn, output = model(x, mask=mask)
#             print(output.shape)
            loss = loss_fn(output,y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss_list.append(train_loss/(j+1))
#         train_loss_list.append(loss.item())

        model.eval()
        val_loss = 0
        for k,(x,y) in enumerate(test_loader):
            attn,ouput = model(x)
            loss = loss_fn(output,y)
            val_loss += loss.item()
        val_loss_list.append(val_loss/(k+1))
#         val_loss_list.append(loss.item())
        
        print(train_loss_list[i],val_loss_list[i])
    print('ok')

0.577320996761322 0.6131795534962102
0.43530078542232514 0.47101974644159017
0.3532609359025955 0.349568992068893
0.2842854198813438 0.26414469276603897
0.21432039165496827 0.18069208844711906
0.17069423252344132 0.19330543397288574
0.13762293165922165 0.17151709606772975
0.11499442717432976 0.10718378895207455
0.09968396879732609 0.14073630932130313
0.0896404535472393 0.12350674365696154
0.08323125897347927 0.12043668290502146
0.0790260289311409 0.09492863459806693
0.07663456022739411 0.09511879047280863
0.07543644925951958 0.1360648478332319
0.07428130033612251 0.11683905085450724
0.07362787175178528 0.09323332929297497
0.07377113457024097 0.1121812223603851
0.0738011933863163 0.10978324593682039
0.0735549517273903 0.08475092327908466
0.07360251867771149 0.10910187249905184
0.07362504950165749 0.09431852694404752
0.07338588567078114 0.10990006770742566
0.0735882171690464 0.11494392450702817
0.07419277983903885 0.10162455706219924
0.07353162306547165 0.11215839809493015
0.073820559337

In [33]:
class My_loss(nn.Module):
    def __init__(self,p):
        super().__init__()   #没有需要保存的参数和状态信息
        self.p = p
        
    def forward(self, x, y):  # 定义前向的函数运算即可
        return torch.mean(torch.pow((x - y), 2))
    
    def g(self,x):
        return 1 / (1 + torch.exp(-p * (x - torch.mean(x)) / 2 / ))
        