In [1]:
import pandas as pd
import numpy as np
import re
import os
from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
from torch.utils.data import RandomSampler, SequentialSampler, WeightedRandomSampler
from torch.utils.data import DataLoader, TensorDataset

data_path = 'input/'
word2vec_path = data_path+'embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# 清理数据
def clean_punct(s):
    puncts = ',.":)(-!?|;\'$&/[]>%=#*+\\•~@£·_{}©^®`<→°€™›♥←×§″′Â█½à…“★”–●â►−¢²¬░¶↑±¿▾═¦║―¥▓—‹─▒：¼⊕▼▪†■’▀¨▄♫☆é¯♦¤▲è¸¾Ã⋅‘∞∙）↓、│（»，♪╩╚³・╦╣╔╗▬❤ïØ¹≤‡√'
    for punct in puncts:
        s = s.replace(punct, ' ')
    s = re.sub(r'\s+', ' ', s)
    s = s.strip()
    return s

def hash_number(s):
    s = re.sub('[0-9]{5,}', '#####', s)
    s = re.sub('[0-9]{4}', '####', s)
    s = re.sub('[0-9]{3}', '###', s)
    s = re.sub('[0-9]{2}', '##', s)
    return s

def clean_text(texts):
    new_texts = []
    for s in texts:
        s = clean_punct(s)
        s = hash_number(s)
        s = s.lower()
        new_texts.append(s)
    return new_texts

In [3]:
# 从已有词向量表中找到对应的单词，并生成子词典（tokenizer）
def get_vocab_by_embed(full_tokenizer, embed_dict):
    word_list = []
    for word in full_tokenizer.word_counts.keys():
        if word in embed_dict:
            word_list.append(word)
    words = ' '.join(word_list)
    sub_vocab = Tokenizer(lower=False)
    sub_vocab.fit_on_texts([words])
    return sub_vocab

# 生成与单词索引匹配的词向量
def get_embedding_matrix(tokenizer, embed_dict):
    vector_size = len(embed_dict['known'])
    embedding_shape = (len(tokenizer.word_index)+1, vector_size)
    embedding_matrix = np.zeros(embedding_shape)
    indexes = []
    for word, index in tokenizer.word_index.items():
        embedding_matrix[index] = embed_dict[word]
        indexes.append(index)
    embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)
    return embedding_matrix

In [15]:
# 将文本转化为tensor
def texts_to_tensor(texts, tokenizer, maxlen=50):
    seqs = tokenizer.texts_to_sequences(texts)
    seqs_padded = pad_sequences(seqs, maxlen=maxlen, padding='post', truncating='pre', value=0)
    seqs_padded = torch.tensor(seqs_padded, dtype=torch.int64)
    mask = seqs_padded == 0
    mask = mask
    return seqs_padded, mask

In [5]:
# 构建dataloader，用于torch训练和测试用
def get_dataloader(x, mask, y=None,training=True, batch_size=32, 
                   weights=None, num_samples=None, drop_last=True):
    if y is None:
        data = TensorDataset(x, mask)
    else:
        data = TensorDataset(x, mask,y)
    if training:
        if weights is None:
            sampler = RandomSampler(data)
        else:
            sampler = WeightedRandomSampler(weights=weights, num_samples=num_samples)
    else:
        sampler = SequentialSampler(data)
    dataloader = DataLoader(data, sampler=sampler, shuffle=False, batch_size=batch_size, drop_last=drop_last)
    return dataloader

In [39]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
import random
import pandas as pd
import numpy as np
import math
import time

def seed_torch(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
class SelfAttention(nn.Module):
    def __init__(self, hidden_size, heads=3, dropout=0.2):
        super(SelfAttention, self).__init__()
        assert hidden_size % heads == 0
        self.query = nn.Linear(hidden_size, hidden_size)
        self.key = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout)
        
        self.hidden_size = hidden_size
        self.heads = heads
        self.attn_size = int(hidden_size / heads)
    
    def transpose_for_scores(self, x, layer):
        x = layer(x)
        new_shape = x.size()[:-1] + (self.heads, self.attn_size)
        x = x.view(*new_shape).permute(0, 2, 1, 3)
        return x
        
    def forward(self, hidden_stations, attention_mask):
        # (batch_size, seq_len, hidden_size)
        hidden_shape = hidden_stations.size()
        # query: (batch_size, heads, seq_len, attn_size)
        query = self.transpose_for_scores(hidden_stations, self.query)
        key = self.transpose_for_scores(hidden_stations, self.key)
        value = self.transpose_for_scores(hidden_stations, self.value)
        
        # (batch_size, heads, query_len, key_len)
        attention_weight = torch.matmul(query, key.transpose(-1,-2)) / math.sqrt(self.attn_size)
        attention_weight = attention_weight.masked_fill_(attention_mask, -1e9)
        attention_weight = nn.Softmax(dim=-1)(attention_weight)
        
        attention_weith = self.dropout(attention_weight)
        
        # (batch_size, heads, query_len, attn_size)
        context = torch.matmul(attention_weight, value)
        context = context.permute(0,2,1,3).contiguous()
        context = context.view(*hidden_shape)
        return context

class CNNLayer(nn.Module):
    def __init__(self, hidden_size, filters, seq_len, dropout=0):
        super(CNNLayer, self).__init__()
        self.layer_1 = nn.Conv2d(1, filters, kernel_size=(1, hidden_size))
        self.layer_2 = nn.Conv2d(filters, filters, kernel_size=(seq_len, 1))
        self.maxpool = nn.MaxPool2d(kernel_size=(seq_len, 1))
        self.dense = nn.Linear(2*filters, 1)
        # self.dropout = nn.Dropout(dropout)
        self.filters = filters
        
    def forward(self, x):
        # (batch_size, 1, seq_len, hidden_size)
        x = x.unsqueeze(1)
        # (batch_size, filters, seq_len, 1)
        conv1 = F.relu(self.layer_1(x))
        # (batch_size, filters, 1, 1)
        out1 = F.relu(self.layer_2(conv1)).squeeze(2).squeeze(2)
        out2 = self.maxpool(conv1).squeeze(2).squeeze(2)
        out = torch.cat([out1, out2], dim=1)
        out = self.dense(out)
        return out
    
class MixModel(nn.Module):
    def __init__(self, attn_layer, cnn_layer, pre_train_embedding, freeze=False):
        super(MixModel, self).__init__()
#         self.attn_layer = attn_layer
#         self.cnn_layer = cnn_layer
        self.embed_layer = nn.Embedding.from_pretrained(pre_train_embedding, freeze=freeze)
        self.linear == nn.Linear(300, 1)
#         self.init_weights()
        
    def init_weights(self):
        for name, param in self.named_parameters():
            if name.find('embed') > -1:
                continue
            elif name.find('weight') > -1 and len(param.size()) > 1:
                nn.init.xavier_uniform_(param)
        
    def forward(self, x, mask):
        embed = self.embed_layer(x)
        embed = embed.mean(dim=1)
        out = self.linear(embed)
#         embed = self.embed_layer(x)
#         # (batch, seq_len, hidden_size)
#         attention_mask = mask.unsqueeze(1).unsqueeze(1)
#         attn = self.attn_layer(embed, attention_mask)
#         cnn_mask = mask.unsqueeze(2)
#         cnn_input = attn.masked_fill_(cnn_mask, 0)
#         out = self.cnn_layer(cnn_input)
        return out
    
    def predict(self, dataloader):
        preds = []
        with torch.no_grad():
            for batch in dataloader:
                batch = tuple(t.to(device) for t in batch)
                X_batch, mask= batch
                preds.append(self.forward(X_batch, mask).data.cpu())
        return torch.cat(preds)

    def predict_proba(self, dataloader):
        return torch.sigmoid(self.predict(dataloader)).data.numpy()

def model_build(embedding_matrix, heads=2, max_seq_len=50, cnn_filter=128):
    embedding_size = len(embedding_matrix[0])
    attn_layer = SelfAttention(hidden_size=embedding_size, heads=heads, dropout=0.2)
    cnn_layer = CNNLayer(hidden_size=embedding_size, filters=128, seq_len=max_seq_len)
    model = MixModel(attn_layer, cnn_layer, pre_train_embedding=embedding_matrix,freeze=False)
    return model

In [40]:
def run_epoch(model, dataloader, optimizer, callbacks=None,
              criterion=nn.BCEWithLogitsLoss(), verbose_step=10000):
    t1 = time.time()
    tr_loss = 0
    for step, batch in enumerate(dataloader):
        batch = tuple(t.to(device) for t in batch)
        x_batch, m_batch, y_batch = batch
        model.zero_grad()
        outputs = model(x_batch, m_batch)
        loss = criterion(outputs[:, 0], y_batch.float())
        loss.backward()
        optimizer.step()
        tr_loss += loss.item()
        if callbacks is not None:
            for func in callbacks:
                func.on_batch_end(model)
        if (step + 1) % verbose_step == 0:
            loss_now = tr_loss / (step + 1)
            print(f'step:{step+1} loss:{loss_now:.7f} time:{time.time() - t1:.1f}s')
    if callbacks is not None:
        for func in callbacks:
            func.on_epoch_end(model)
    return tr_loss / (step + 1)

In [8]:
train_df = pd.read_csv(data_path+'train.csv')
train_texts = list(train_df.question_text.values)
train_texts = clean_text(train_texts)


word2vec_dict = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
full_tokenizer = Tokenizer(lower=False)
full_tokenizer.fit_on_texts(train_texts)

word2vec_tokenizer = get_vocab_by_embed(full_tokenizer, word2vec_dict)

word2vec_matrix = get_embedding_matrix(word2vec_tokenizer, word2vec_dict)

train_x, train_mask = texts_to_tensor(train_texts, word2vec_tokenizer)
train_y = torch.tensor(train_df.target.values, dtype=torch.float32)

train_dataloader = get_dataloader(train_x, train_y)

int32


In [9]:
eval_x = train_x[1000000:]
eval_y = train_y[1000000:]
eval_m = train_mask[1000000:]
x = train_x[:1000000]
y = train_y[:1000000]
m = train_mask[:1000000]
train_loader = get_dataloader(x, m,y, batch_size=100, training=True)
eval_loder = get_dataloader(eval_x, eval_m, batch_size=100, training=False)

In [160]:
word2vec_matrix.dtype

torch.float32

In [41]:
device = torch.device('cuda') if torch.cuda.is_available() else  torch.device('cpu')
# device = torch.device('cpu')
model = model_build(word2vec_matrix)
optimizer = Adam(model.parameters())
model.cuda()
for _ in range(1):
    model.train()
    
    loss = run_epoch(model, train_loader, optimizer)
    print(loss)

AttributeError: 'MixModel' object has no attribute 'linear'

In [38]:
model

MixModel(
  (attn_layer): SelfAttention(
    (query): Linear(in_features=10, out_features=10, bias=True)
    (key): Linear(in_features=10, out_features=10, bias=True)
    (value): Linear(in_features=10, out_features=10, bias=True)
    (dropout): Dropout(p=0.2)
  )
  (cnn_layer): CNNLayer(
    (layer_1): Conv2d(1, 128, kernel_size=(1, 10), stride=(1, 1))
    (layer_2): Conv2d(128, 128, kernel_size=(50, 1), stride=(1, 1))
    (maxpool): MaxPool2d(kernel_size=(50, 1), stride=(50, 1), padding=0, dilation=1, ceil_mode=False)
    (dense): Linear(in_features=256, out_features=1, bias=True)
  )
  (embed_layer): Embedding(76655, 10)
)

In [20]:
class test_model(nn.Module):
    def __init__(self, pre_train_embedding, freeze=False):
        super(test_model, self).__init__()
        self.embed_layer = nn.Embedding.from_pretrained(pre_train_embedding, freeze=freeze)
        self.linear = nn.Linear(300, 1)
        self.init_weights()
        
    def init_weights(self):
        for name, param in self.named_parameters():
            if name.find('embed') > -1:
                continue
            elif name.find('weight') > -1 and len(param.size()) > 1:
                nn.init.xavier_uniform_(param)
        
    def forward(self, x, mask):
        embed = self.embed_layer(x)
        out = self.linear(embed)
        return out

In [146]:
for step, batch in enumerate(train_loader):
#     batch = tuple(t.to(device) for t in batch)
    x_batch, m_batch, y_batch = batch
#     model.zero_grad()
#     outputs = model(x_batch, m_batch)
    break

In [140]:
torch.randn(2,4,300).dtype

torch.float32