In [14]:
import preprocess
from bulid_model import *
from sklearn.metrics import f1_score

In [26]:
a = torch.randn(3,10)
a[:-1,-4:] = 0
a[-1,-4:] = 0
# lengths = torch.tensor([5,6,6])
lengths = [6,6,5]
b = torch.nn.utils.rnn.pack_padded_sequence(a, lengths, batch_first=True)
c,new_lengths = torch.nn.utils.rnn.pad_packed_sequence(b,batch_first=True)
c.shape, new_lengths, a.shape

(torch.Size([3, 6]), tensor([6, 6, 5]), torch.Size([3, 10]))

In [2]:
train_x, train_mask,train_y, word2vec_embedding = preprocess.run()

In [3]:
def run_epoch(model, dataloader, optimizer, callbacks=None,
              criterion=nn.BCEWithLogitsLoss(), verbose_step=100):
    t1 = time.time()
    tr_loss = 0
    for step, batch in enumerate(dataloader):
        batch = tuple(t.cuda() for t in batch)
        x_batch, m_batch, y_batch = batch
        
        model.zero_grad()
        outputs = model(x_batch, m_batch)
        loss = criterion(outputs[:, 0], y_batch.float())
        loss.backward()
        optimizer.step()
        tr_loss += loss.item()
        if callbacks is not None:
            for func in callbacks:
                func.on_batch_end(model)
        if (step + 1) % verbose_step == 0:
            loss_now = tr_loss / (step + 1)
            print(f'step:{step+1} loss:{loss_now:.7f} time:{time.time() - t1:.1f}s')
    if callbacks is not None:
        for func in callbacks:
            func.on_epoch_end(model)
    return tr_loss / (step + 1)


def eval_data(model, dataloader, y_eval, threshold=0.3):
    y_prob = []
    with torch.no_grad():
        for batch in dataloader:
            batch = tuple(t.cuda() for t in batch)
            x_batch, m_batch = batch
            outputs = model(x_batch, m_batch)
            outputs = torch.sigmoid(outputs)
            y_prob.append(outputs.data.cpu())
    y_prob = torch.cat(y_prob).data.numpy()
    y_pred = (y_prob>threshold).astype(int)
    f1 = f1_score(y_eval, y_pred)
    print('eval f1 socre:', f1)

In [4]:
class GRUModel(nn.Module):
    def __init__(self, pretrained_embedding, proj_dim=128, rnn_dim=128, n_layers=1, bidirectional=False,
                 padding_idx=0, fix_embedding=False,
                 n_out=1):
        super(GRUModel, self).__init__()
        self.embed_dim = len(pretrained_embedding[0])
        self.n_layers = n_layers
        self.dense_dim = rnn_dim * 2 if bidirectional else rnn_dim
        self.n_out = n_out
        self.bidirectional = bidirectional
        self.fix_embedding = fix_embedding
        self.padding_idx = padding_idx
#         if pretrained_embedding is not None:
        self.embed = nn.Embedding.from_pretrained(pretrained_embedding, freeze=fix_embedding)
        self.embed.padding_idx = self.padding_idx
#         else:
#             self.embed = nn.Embedding(self.n_vocab, self.embed_dim, padding_idx=self.padding_idx)
        self.proj = nn.Linear(self.embed_dim, proj_dim)
        self.proj_act = nn.ReLU()
        self.gru = nn.GRU(proj_dim, rnn_dim, self.n_layers,
                          batch_first=True, bidirectional=bidirectional)
        self.pooling = GlobalMaxPooling1D()
        in_dim = 2 * rnn_dim if self.bidirectional else rnn_dim
        self.dense = nn.Linear(in_dim, self.dense_dim)
        self.dense_act = nn.ReLU()
        self.out_linear = nn.Linear(self.dense_dim, n_out)
        self.init_weights()

    def init_weights(self):
        for name, param in self.named_parameters():
            if name.find('embed') > -1:
                continue
            elif name.find('weight') > -1 and len(param.size()) > 1:
                nn.init.xavier_uniform_(param)

    def forward(self, inputs, mask=None):
        # inputs: (bs, max_len)
        x = self.embed(inputs)
        x = self.proj_act(self.proj(x))
        x, hidden = self.gru(x)
        x = self.pooling(x)
        x = self.dense_act(self.dense(x))
        x = self.out_linear(x)
        return x

In [18]:
class SelfAttention(nn.Module):
    def __init__(self, hidden_size, heads=3, seq_len=50, dropout=0.2):
        super(SelfAttention, self).__init__()
        assert hidden_size % heads == 0
        self.query = nn.Linear(hidden_size, hidden_size)
        self.key = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout)
        
        self.maxpool = nn.MaxPool2d(kernel_size=(seq_len, 1))
        self.out_layer = nn.Linear(hidden_size, 1)
        
        self.hidden_size = hidden_size
        self.heads = heads
        self.attn_size = int(hidden_size / heads)
    
    def transpose_for_scores(self, x, layer):
        x = layer(x)
        new_shape = x.size()[:-1] + (self.heads, self.attn_size)
        x = x.view(*new_shape).permute(0, 2, 1, 3)
        return x
        
    def forward(self, hidden_stations, attention_mask):
        # (batch_size, seq_len, hidden_size)
        
        attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)
        hidden_shape = hidden_stations.size()
        # query: (batch_size, heads, seq_len, attn_size)
        query = self.transpose_for_scores(hidden_stations, self.query)
        key = self.transpose_for_scores(hidden_stations, self.key)
        value = self.transpose_for_scores(hidden_stations, self.value)
        
        # (batch_size, heads, query_len, key_len)
        attention_weight = torch.matmul(query, key.transpose(-1,-2)) / math.sqrt(self.attn_size)
        attention_weight = attention_weight.masked_fill_(attention_mask, -1e9)
        attention_weight = nn.Softmax(dim=-1)(attention_weight)
        
        attention_weith = self.dropout(attention_weight)
        
        # (batch_size, heads, query_len, attn_size)
        context = torch.matmul(attention_weight, value)
        context = context.permute(0,2,1,3).contiguous()
        context = context.view(*hidden_shape)
        
        out = torch.relu(context)
             
#         context = context.unsqueeze(1)
#         context = self.maxpool(context).squeeze()
#         out = self.out_layer(context)
        return  out

class CNN(nn.Module):
    def __init__(self, hidden_size, filters, seq_len, dropout=0.2):
        super(CNN, self).__init__()
        self.layer_1 = nn.Conv2d(1, filters, kernel_size=(1, hidden_size))
#         self.layer_2 = nn.Conv2d(filters, filters, kernel_size=(seq_len, 1))
        self.maxpool = nn.MaxPool2d(kernel_size=(seq_len, 1))
        self.dense = nn.Linear(filters, 1)
        self.dropout = nn.Dropout(dropout)
        self.filters = filters
        
    def forward(self, x, mask):
        # (batch_size, 1, seq_len, hidden_size)
        x = x.unsqueeze(1)
        cnn_mask = mask.unsqueeze(1).unsqueeze(3)
        x = x.masked_fill_(cnn_mask, 0)
        # (batch_size, filters, seq_len, 1)
        conv1 = F.relu(self.layer_1(x))
        # (batch_size, filters, 1, 1)
#         out1 = F.relu(self.layer_2(conv1)).squeeze(2).squeeze(2)
        out = self.maxpool(conv1).squeeze()
#         out = torch.cat([out1, out2], dim=1)
        out = self.dropout(out)
        out = self.dense(out)
        return out
    
class AttnCNNModel(nn.Module):
    def __init__(self, embedding_matrix, layers, freeze=False):
        super(AttnCNNModel, self).__init__()
        self.embed_layer = nn.Embedding.from_pretrained(embedding_matrix, freeze=freeze)
        self.layers = nn.ModuleList(layers)
    
    def forward(self, x, mask):
        out = self.embed_layer(x)
        for layer in self.layers:
            out = layer(out, mask)
        return out
    
def model_build(embedding_matrix, heads=6, max_seq_len=50, cnn_filter=128):
    embedding_size = len(embedding_matrix[0])
    attn_layer = SelfAttention(hidden_size=embedding_size, heads=heads, seq_len=max_seq_len,dropout=0.2)
    cnn_layer = CNN(embedding_size, cnn_filter, max_seq_len, dropout=0.2)
    model = AttnCNNModel(embedding_matrix, [attn_layer,cnn_layer])
#     cnn_layer = CNNLayer(hidden_size=embedding_size, filters=128, seq_len=max_seq_len)
#     model = MixModel(attn_layer, cnn_layer, pre_train_embedding=embedding_matrix,freeze=False)
    return model
    

In [3]:
# train_x = train_x.cuda()
# train_y = train_y.cuda()
# train_mask = train_mask.cuda()
eval_x = train_x[1000000:]
eval_y = train_y[1000000:]
eval_m = train_mask[1000000:]
x = train_x[:1000000]
y = train_y[:1000000]
m = train_mask[:1000000]
train_loader = preprocess.get_dataloader(x, m,y, batch_size=500, training=True)
eval_loder = preprocess.get_dataloader(eval_x, eval_m,batch_size=500, training=False,drop_last=False)

In [28]:
eval_x.shape

torch.Size([306122, 50])

In [25]:
# device = torch.device('cuda') if torch.cuda.is_available() else  torch.device('cpu')
# device = torch.device('cpu')
# model = GRUModel(word2vec_embedding)
model = model_build(word2vec_embedding, )
optimizer = Adam(model.parameters())
model.cuda()
for _ in range(2):
    model.train()
    loss = run_epoch(model, train_loader, optimizer)
    model.eval()
    eval_data(model, eval_loder, y_eval=eval_y, threshold=0.3)

step:100 loss:0.2252112 time:6.2s
step:200 loss:0.1779917 time:12.4s
step:300 loss:0.1611622 time:18.5s
step:400 loss:0.1510682 time:24.6s
step:500 loss:0.1444479 time:30.8s
step:600 loss:0.1398672 time:36.9s
step:700 loss:0.1366521 time:43.2s
step:800 loss:0.1341730 time:49.4s
step:900 loss:0.1319987 time:55.5s
step:1000 loss:0.1301171 time:61.7s
step:1100 loss:0.1281581 time:67.9s
step:1200 loss:0.1267251 time:74.1s
step:1300 loss:0.1254922 time:80.3s
step:1400 loss:0.1244053 time:86.5s
step:1500 loss:0.1232737 time:92.7s
step:1600 loss:0.1221937 time:98.8s
step:1700 loss:0.1215380 time:105.0s
step:1800 loss:0.1206982 time:111.3s
step:1900 loss:0.1200224 time:117.5s
step:2000 loss:0.1192632 time:123.6s
eval f1 socre: 0.6502993409468231
step:100 loss:0.0945010 time:6.4s
step:200 loss:0.0932860 time:12.5s
step:300 loss:0.0932547 time:18.7s
step:400 loss:0.0930474 time:24.9s
step:500 loss:0.0929594 time:31.1s
step:600 loss:0.0931829 time:37.3s
step:700 loss:0.0935115 time:43.6s
step:800

KeyboardInterrupt: 

In [4]:
df = pd.read_csv('submission.csv')

In [9]:
f1_score(eval_y.data.numpy(), df.prediction.values)


0.7059791962882586