In [141]:
import math
import random
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

In [142]:
def seed_torch(seed):
    """Seed the random number generators used in this notebook.

    Applies ``seed`` to Python's ``random`` module, NumPy's global generator,
    ``torch.manual_seed`` and ``torch.cuda.manual_seed``, then turns on
    cuDNN's deterministic mode.

    Args:
        seed: Value handed unchanged to every seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True

In [200]:
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are linear projections of the same input. The
    per-head contexts are concatenated and returned directly; there is no
    output projection.

    Args:
        hidden_size: Size of the last input dimension; must be divisible by
            ``heads``.
        heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, hidden_size, heads=3, dropout=0.2):
        super(SelfAttention, self).__init__()
        assert hidden_size % heads == 0, "hidden_size must be divisible by heads"
        self.query = nn.Linear(hidden_size, hidden_size)
        self.key = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout)

        self.hidden_size = hidden_size
        self.heads = heads
        # Integer division: the assert above guarantees an exact split.
        self.attn_size = hidden_size // heads

    def transpose_for_scores(self, x, layer):
        """Project ``x`` with ``layer`` and split the result into heads.

        Args:
            x: Tensor of shape (batch_size, seq_len, hidden_size).
            layer: Projection module applied to ``x`` first.

        Returns:
            Tensor of shape (batch_size, heads, seq_len, attn_size).
        """
        x = layer(x)
        new_shape = x.size()[:-1] + (self.heads, self.attn_size)
        return x.view(*new_shape).permute(0, 2, 1, 3)

    def forward(self, hidden_stations, attention_mask):
        """Attend over the sequence, ignoring masked key positions.

        Args:
            hidden_stations: Input of shape (batch_size, seq_len, hidden_size).
            attention_mask: Mask that is truthy (``True`` / non-zero) at the
                positions to ignore. It must broadcast against
                (batch_size, heads, query_len, key_len). It is not modified.

        Returns:
            Context tensor with the same shape as ``hidden_stations``.
        """
        # (batch_size, seq_len, hidden_size)
        hidden_shape = hidden_stations.size()
        # (batch_size, heads, seq_len, attn_size)
        query = self.transpose_for_scores(hidden_stations, self.query)
        key = self.transpose_for_scores(hidden_stations, self.key)
        value = self.transpose_for_scores(hidden_stations, self.value)

        # (batch_size, heads, query_len, key_len)
        scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(self.attn_size)
        # Out-of-place fill: masked keys get a large negative score so softmax
        # sends their weight to zero, and the caller's mask stays intact.
        scores = scores.masked_fill(attention_mask.bool(), -1e9)
        attention_weight = F.softmax(scores, dim=-1)
        attention_weight = self.dropout(attention_weight)

        # (batch_size, heads, query_len, attn_size)
        context = torch.matmul(attention_weight, value)
        # Back to (batch_size, seq_len, heads, attn_size), then merge the heads.
        context = context.permute(0, 2, 1, 3).contiguous()
        return context.view(*hidden_shape)

In [194]:
class CNNLayer(nn.Module):
    """Convolutional head that maps a sequence of vectors to one logit.

    A width-``hidden_size`` convolution turns each position into ``filters``
    features. Those features are summarised two ways, by a convolution
    spanning all ``seq_len`` positions and by a max-pool over the same span,
    and the two summaries are concatenated and fed to a linear layer.

    Args:
        hidden_size: Size of each position's feature vector.
        filters: Number of convolution filters.
        seq_len: Sequence length the layer is built for; inputs must have
            exactly this many positions for the final linear layer to fit.
        dropout: Accepted for interface compatibility; currently unused.
    """

    def __init__(self, hidden_size, filters, seq_len, dropout=0):
        super(CNNLayer, self).__init__()
        self.layer_1 = nn.Conv2d(1, filters, kernel_size=(1, hidden_size))
        self.layer_2 = nn.Conv2d(filters, filters, kernel_size=(seq_len, 1))
        self.maxpool = nn.MaxPool2d(kernel_size=(seq_len, 1))
        self.dense = nn.Linear(2 * filters, 1)
        # NOTE: `dropout` is intentionally not wired in; no dropout is applied.
        self.filters = filters

    def forward(self, x):
        """Compute one logit per sequence.

        Args:
            x: Tensor of shape (batch_size, seq_len, hidden_size).

        Returns:
            Tensor of shape (batch_size, 1).
        """
        # (batch_size, 1, seq_len, hidden_size)
        x = x.unsqueeze(1)
        # (batch_size, filters, seq_len, 1)
        conv1 = F.relu(self.layer_1(x))
        # Both branches are (batch_size, filters, 1, 1); flatten(1) drops the
        # two trailing singleton dims in one step.
        out1 = F.relu(self.layer_2(conv1)).flatten(1)
        out2 = self.maxpool(conv1).flatten(1)
        # (batch_size, 2 * filters)
        out = torch.cat([out1, out2], dim=1)
        return self.dense(out)

In [201]:
class MixModel(nn.Module):
    """Embedding -> attention layer -> CNN layer classifier.

    Args:
        attn_layer: Module called as ``attn_layer(embedded, mask)``.
        cnn_layer: Module called as ``cnn_layer(masked_attention_output)``.
        pre_train_embedding: Pre-trained embedding matrix. A tensor is used
            as is; anything else (NumPy array, nested list) is copied into a
            float tensor first.
        freeze: Passed to ``nn.Embedding.from_pretrained``; when ``True`` the
            embeddings are not updated during training.
    """

    def __init__(self, attn_layer, cnn_layer, pre_train_embedding, freeze=False):
        super(MixModel, self).__init__()
        self.attn_layer = attn_layer
        self.cnn_layer = cnn_layer
        if not torch.is_tensor(pre_train_embedding):
            # from_pretrained requires a tensor; copy so training never writes
            # back into the caller's array.
            pre_train_embedding = torch.tensor(
                np.asarray(pre_train_embedding), dtype=torch.float
            )
        self.embed_layer = nn.Embedding.from_pretrained(pre_train_embedding, freeze=freeze)
        self.init_weights()

    def init_weights(self):
        """Xavier-initialise multi-dimensional weights outside the embedding.

        Parameters whose name contains ``'embed'`` are skipped so the
        pre-trained vectors are preserved; parameters that are not named
        ``weight`` or are one-dimensional keep their default initialisation.
        """
        for name, param in self.named_parameters():
            if 'embed' in name:
                continue
            if 'weight' in name and param.dim() > 1:
                nn.init.xavier_uniform_(param)

    def forward(self, x, mask):
        """Compute logits for a batch of token-id sequences.

        Args:
            x: Token ids, fed to the embedding layer.
            mask: Attention mask forwarded to ``attn_layer``; positions where
                it equals 0 are kept for the CNN input, all others are zeroed.
                The indexing below suggests a (batch, 1, 1, seq_len) layout
                -- TODO confirm against the data pipeline.

        Returns:
            Whatever ``cnn_layer`` returns for the masked attention output.
        """
        embed = self.embed_layer(x)
        # (batch, seq_len, hidden_size)
        attn = self.attn_layer(embed, mask)
        # Reduce the mask to one flag per position, shaped to broadcast over
        # the hidden dimension.
        cnn_mask = mask[:, 0, :].squeeze(1).unsqueeze(2)
        keep = cnn_mask == 0
        cnn_input = attn * keep
        return self.cnn_layer(cnn_input)

    def predict(self, dataloader):
        """Run inference over ``dataloader`` and return the stacked outputs.

        Each batch is a sequence of tensors: the first is ``x`` and the
        second, when present, is the mask. If a batch contains only ``x``, an
        all-``False`` mask of shape (batch, 1, 1, seq_len) is used, i.e. no
        position is masked (this assumes ``x`` is (batch, seq_len)).

        The model is switched to eval mode for the duration of the call and
        restored to its previous mode afterwards. Batches are moved to the
        device holding the embedding weights.

        Returns:
            CPU tensor of concatenated outputs, or an empty tensor when the
            dataloader yields no batches.
        """
        device = self.embed_layer.weight.device
        was_training = self.training
        self.eval()
        preds = []
        try:
            with torch.no_grad():
                for batch in dataloader:
                    batch = tuple(t.to(device) for t in batch)
                    x_batch = batch[0]
                    if len(batch) > 1:
                        mask_batch = batch[1]
                    else:
                        mask_batch = torch.zeros(
                            x_batch.size(0), 1, 1, x_batch.size(1),
                            dtype=torch.bool, device=device,
                        )
                    preds.append(self(x_batch, mask_batch).cpu())
        finally:
            # Restore the caller's train/eval mode even if a batch fails.
            self.train(was_training)
        if not preds:
            return torch.empty(0)
        return torch.cat(preds)

    def predict_proba(self, dataloader):
        """Return ``sigmoid(predict(dataloader))`` as a NumPy array."""
        return torch.sigmoid(self.predict(dataloader)).numpy()

In [197]:
def run_epoch(model, dataloader, optimizer, callbacks=None,
              criterion=nn.BCEWithLogitsLoss(), verbose_step=10000):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Module to train; it is put into training mode first.
        dataloader: Iterable of batches. Each batch is a sequence of tensors
            whose last element is the target; every preceding element is
            passed positionally to ``model`` (so both ``(x, y)`` and
            ``(x, mask, y)`` batches work).
        optimizer: Optimizer stepped once per batch.
        callbacks: Optional iterable of objects exposing
            ``on_batch_end(model)`` and ``on_epoch_end(model)``.
        criterion: Loss called as ``criterion(outputs[:, 0], y.float())``.
        verbose_step: Print running loss and elapsed time every this many
            steps; a falsy value disables the printing.

    Returns:
        Mean per-batch loss as a float, or ``nan`` if the dataloader yielded
        no batches.
    """
    # Move batches to wherever the model lives rather than relying on a
    # notebook-global `device`.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    callbacks = callbacks if callbacks is not None else ()

    model.train()
    t1 = time.time()
    tr_loss = 0.0
    n_steps = 0
    for n_steps, batch in enumerate(dataloader, start=1):
        *inputs, y_batch = (t.to(device) for t in batch)
        model.zero_grad()
        outputs = model(*inputs)
        loss = criterion(outputs[:, 0], y_batch.float())
        loss.backward()
        optimizer.step()
        tr_loss += loss.item()
        for func in callbacks:
            func.on_batch_end(model)
        if verbose_step and n_steps % verbose_step == 0:
            loss_now = tr_loss / n_steps
            print(f'step:{n_steps} loss:{loss_now:.7f} time:{time.time() - t1:.1f}s')
    for func in callbacks:
        func.on_epoch_end(model)
    if n_steps == 0:
        # No batches: there is no mean loss to report.
        return float("nan")
    return tr_loss / n_steps

In [199]:
def model_build(embedding_matrix, head=6, max_seq_len=50, cnn_filter=128):
    """Assemble a ``MixModel`` from a pre-trained embedding matrix.

    Args:
        embedding_matrix: 2-D embedding matrix (one row per token). A tensor
            is used as is; anything else is copied into a float tensor.
        head: Number of attention heads; the embedding width must be
            divisible by it.
        max_seq_len: Sequence length the CNN layer is built for.
        cnn_filter: Number of convolution filters in the CNN layer.

    Returns:
        A ``MixModel`` with trainable (non-frozen) embeddings.
    """
    if not torch.is_tensor(embedding_matrix):
        # nn.Embedding.from_pretrained needs a tensor, not an array or list.
        embedding_matrix = torch.tensor(np.asarray(embedding_matrix), dtype=torch.float)
    embedding_size = embedding_matrix.shape[1]
    attn_layer = SelfAttention(hidden_size=embedding_size, heads=head, dropout=0.2)
    cnn_layer = CNNLayer(hidden_size=embedding_size, filters=cnn_filter, seq_len=max_seq_len)
    return MixModel(attn_layer, cnn_layer, pre_train_embedding=embedding_matrix, freeze=False)


In [202]:
-1e9

-1000000000.0

In [190]:
# Sanity-check fixture: a single-head attention layer with dropout disabled and
# every parameter set to 0.1, so its output can be verified by hand.
layer = SelfAttention(6, heads=1, dropout=0)
for p in layer.parameters():
    # In-place constant init (runs without autograd tracking) instead of
    # rebinding `p.data`.
    nn.init.constant_(p, 0.1)

In [191]:
# NOTE(review): `x` and `mask` are not defined in any cell shown here, so this
# cell depends on leftover kernel state. `SelfAttention.forward` returns a single
# tensor, so the trailing comma unpacks it along dim 0 and only succeeds when
# that dimension has length 1 -- confirm this is intended.
context, = layer(x, mask)

In [175]:
weight

tensor([[[[0.3301, 0.3425, 0.3274, 0.0000],
          [0.2857, 0.4566, 0.2577, 0.0000],
          [0.3382, 0.3192, 0.3426, 0.0000],
          [0.2698, 0.4939, 0.2363, 0.0000]]]], grad_fn=<SoftmaxBackward>)

In [176]:
context

tensor([[[0.1495, 0.1495, 0.1495, 0.1495, 0.1495, 0.1495],
         [0.2039, 0.2039, 0.2039, 0.2039, 0.2039, 0.2039],
         [0.1383, 0.1383, 0.1383, 0.1383, 0.1383, 0.1383],
         [0.2215, 0.2215, 0.2215, 0.2215, 0.2215, 0.2215]]],
       grad_fn=<ViewBackward>)

tensor([[[0.1495, 0.1495, 0.1495, 0.1495, 0.1495, 0.1495],
         [0.2039, 0.2039, 0.2039, 0.2039, 0.2039, 0.2039],
         [0.1383, 0.1383, 0.1383, 0.1383, 0.1383, 0.1383],
         [0.2215, 0.2215, 0.2215, 0.2215, 0.2215, 0.2215]]],
       grad_fn=<ViewBackward>)