In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which competition files are available under the input directory.
input_entries = os.listdir("../input")
print(input_entries)

# Any results you write to the current directory are saved as output.

['embeddings', 'sample_submission.csv', 'test.csv', 'train.csv']


In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Shared settings for the text pipeline.
path = '../input'
max_features = 50000  # handed to the Tokenizer as num_words
max_len = 70          # length every id sequence is padded/truncated to


def _to_padded_ids(texts):
    """Tokenize `texts` with the fitted `tokenizer` and pad each row to `max_len`."""
    id_sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(id_sequences, maxlen=max_len)


# ---- Training data: read once, hold out 10% for validation ----
csv_path_train = os.path.join(path, 'train.csv')
csv_data_train = pd.read_csv(csv_path_train)
train_data, val_data = train_test_split(
    csv_data_train,
    test_size=0.1,
    random_state=2018,
)

train_y, val_y = train_data['target'].values, val_data['target'].values
train_X = train_data["question_text"].fillna("_na_").values
val_X = val_data["question_text"].fillna("_na_").values

# The vocabulary is fitted on the training split only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
# NOTE: despite its name, `word_index` holds the tokenizer's index -> word map
# (integer ids as keys); later cells iterate it as (id, word) pairs.
word_index = tokenizer.index_word
train_X, val_X = _to_padded_ids(train_X), _to_padded_ids(val_X)

# ---- Test data ----
csv_path_test = os.path.join(path, 'test.csv')
csv_data_test = pd.read_csv(csv_path_test)
test_qid = csv_data_test['qid']
test_X = _to_padded_ids(csv_data_test["question_text"].fillna("_na_").values)

# The raw frames are no longer needed once the arrays above exist.
del csv_data_train, csv_data_test

print(train_X.shape, val_X.shape, test_X.shape, train_y.shape, val_y.shape, test_qid.shape, len(word_index))

Using TensorFlow backend.


(1175509, 70) (130613, 70) (56370, 70) (1175509,) (130613,) (56370,) 209286


In [3]:
from tqdm import tqdm

# Width of each pretrained vector; used as the second dimension of the embedding matrix.
embed_size = 300

# Show what is available in the GloVe embeddings folder.
glove_dir_entries = os.listdir("../input/embeddings/glove.840B.300d")
print(glove_dir_entries)
def create_embed_matrix():
    """Build the embedding matrix for the tokenizer vocabulary from GloVe vectors.

    Reads ``glove.840B.300d.txt`` from under the module-level ``path``, then
    creates a ``(max_features, embed_size)`` matrix. Rows are first drawn from a
    normal distribution matching the mean/std of all pretrained values, and every
    row whose token has a pretrained vector is overwritten with that vector.

    Relies on the module-level names ``path``, ``max_features``, ``embed_size``
    and ``word_index`` (an index -> word mapping).

    Returns:
        np.ndarray of shape ``(max_features, embed_size)``.
    """
    embedding_file_path = os.path.join(path, 'embeddings', 'glove.840B.300d', 'glove.840B.300d.txt')

    embed_dict = {}
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding.
    with open(embedding_file_path, encoding='utf8') as file:
        for o in tqdm(file):
            # Strip the trailing newline so it does not ride along on the last number.
            split_o = o.rstrip().split(" ")
            embed_dict[split_o[0]] = np.array(split_o[1:], dtype=np.float32)

    # np.stack needs a real sequence; a dict view is rejected by newer NumPy.
    all_embeds = np.stack(list(embed_dict.values()))
    embed_mean, embed_std = all_embeds.mean(), all_embeds.std()

    # Rows without a pretrained vector keep this random initialisation.
    # Fix: the statistics are a mean and a standard deviation, so they must
    # parameterise a normal distribution; passing them to uniform() treated them
    # as (low, high) bounds.
    embed_matrix = np.random.normal(embed_mean, embed_std, size=(max_features, embed_size))

    # `word_index` is keyed by integer token id (index -> word).
    for idx, word in tqdm(word_index.items()):
        if idx >= max_features:
            continue  # id falls outside the rows of the matrix
        embed_vector = embed_dict.get(word)
        if embed_vector is not None:
            embed_matrix[idx] = embed_vector

    return embed_matrix

# Build the pretrained embedding matrix once and confirm its dimensions.
embed_matrix = create_embed_matrix()
print(np.shape(embed_matrix))

2474it [00:00, 12435.29it/s]

['glove.840B.300d.txt']


2196017it [03:08, 11651.55it/s]
100%|██████████| 209286/209286 [00:00<00:00, 911846.51it/s]


(50000, 300)


In [6]:
from keras import backend as K
from tensorflow.keras.backend import batch_dot
from keras.layers import Dense
from keras.activations import softmax
from keras.engine.topology import Layer

class Position_Embeddding(Layer):
    """Adds sinusoidal positional encodings to a batch of embedded sequences.

    For position ``i`` and frequency index ``j`` in ``[0, size / 2)`` the layer
    computes ``i / 10000 ** (2 * j / size)``, concatenates the sines and cosines
    of those values along the last axis, and adds the result to the input.
    The layer has no trainable weights.
    """

    def __init__(self, size=None, **kwargs):
        """
        Args:
            size: Width of the positional encoding. ``None`` (default) means the
                last dimension of the input passed to ``call`` is used.
            **kwargs: Forwarded to the base ``Layer`` constructor.
        """
        self.size = size
        super(Position_Embeddding, self).__init__(**kwargs)

    def call(self, x):
        """Return ``x`` plus its positional encoding (same shape as ``x``).

        ``x`` is indexed as a rank-3 tensor (``x[:, :, 0]``); its last dimension
        must equal the encoding width for the final addition to be valid.

        Raises:
            ValueError: if the encoding width is odd, because the sine and cosine
                halves could not then add up to that width.
        """
        # Resolve the width locally instead of overwriting self.size, so reusing
        # the layer on another input does not see a stale inferred value.
        size = self.size
        if size is None:
            size = int(x.shape[-1])  # embed_size
        if size % 2 != 0:
            raise ValueError(
                "Position_Embeddding needs an even size, got {}".format(size))

        # Inverse frequencies, one per sin/cos pair: shape (1, size / 2).
        position_j = 1. / K.pow(10000., (2 * K.arange(size // 2, dtype='float32') / size))
        position_j = K.expand_dims(position_j, 0)
        # Position ids 0, 1, 2, ... along the sequence axis: shape (batch, seq_len, 1).
        position_i = K.cumsum(K.ones_like(x[:, :, 0]), 1) - 1
        position_i = K.expand_dims(position_i, 2)
        position_ij = K.dot(position_i, position_j)
        # Sines fill the first half of the last axis, cosines the second half.
        position_ij = K.concatenate([K.sin(position_ij), K.cos(position_ij)], axis=2)
        position_embeding = position_ij + x

        return position_embeding  # batch_size, max_len, embed_size

    def compute_output_shape(self, input_shape):
        """The layer does not change the shape of its input."""
        return input_shape

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super(Position_Embeddding, self).get_config()
        config['size'] = self.size
        return config


class Attention(Layer):
    """Multi-head scaled dot-product attention.

    The layer is called on a list of tensors:

    * ``[Q_seq, K_seq, V_seq]`` - no masking is applied;
    * ``[Q_seq, K_seq, V_seq, Q_len, V_len]`` - positions at or beyond the given
      lengths are masked out.

    The output has shape ``(batch, query_steps, nb_head * size_per_head)``.
    """

    def __init__(self, nb_head, size_per_head, **kwargs):
        """
        Args:
            nb_head: Number of attention heads.
            size_per_head: Width of each head.
            **kwargs: Forwarded to the base ``Layer`` constructor.
        """
        self.nb_head = nb_head
        self.size_per_head = size_per_head
        self.output_dim = nb_head*size_per_head
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create one projection matrix each for queries, keys and values.

        ``input_shape`` is a list of shapes; only its first three entries
        (Q, K, V) are used to size the weights.
        """
        self.WQ = self.add_weight(name='WQ',
                                  shape=(input_shape[0][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WK = self.add_weight(name='WK',
                                  shape=(input_shape[1][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WV = self.add_weight(name='WV',
                                  shape=(input_shape[2][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        super(Attention, self).build(input_shape)

    def Mask(self, inputs, seq_len, mode='mul'):
        """Mask positions along axis 1 of ``inputs`` that lie at or past ``seq_len``.

        Args:
            inputs: Tensor whose axis 1 is the axis being masked.
            seq_len: ``None`` to skip masking, otherwise a tensor whose first
                column (``seq_len[:, 0]``) holds one length per batch row.
            mode: ``'mul'`` zeroes the masked positions; ``'add'`` subtracts
                ``1e12`` from them so a following softmax drives them towards zero.

        Raises:
            ValueError: if ``mode`` is neither ``'mul'`` nor ``'add'``.
        """
        # Identity check: `== None` on a tensor is not a reliable None test.
        if seq_len is None:
            return inputs
        if mode not in ('mul', 'add'):
            raise ValueError(
                "mode must be 'mul' or 'add', got {!r}".format(mode))

        # one_hot marks the first masked index; 1 - cumsum gives ones before it
        # and zeros from it onwards.
        mask = K.one_hot(seq_len[:, 0], K.shape(inputs)[1])
        mask = 1 - K.cumsum(mask, 1)
        # Append trailing axes so the mask broadcasts against `inputs`.
        for _ in range(len(inputs.shape) - 2):
            mask = K.expand_dims(mask, 2)
        if mode == 'mul':
            return inputs * mask
        return inputs - (1 - mask) * 1e12

    def _project(self, seq, kernel):
        """Project ``seq`` with ``kernel`` and split it into heads.

        Returns a tensor laid out as (batch, nb_head, steps, size_per_head).
        """
        seq = K.dot(seq, kernel)
        seq = K.reshape(seq, (-1, K.shape(seq)[1], self.nb_head, self.size_per_head))
        return K.permute_dimensions(seq, (0, 2, 1, 3))

    def call(self, x):
        """Apply attention to ``[Q, K, V]`` or ``[Q, K, V, Q_len, V_len]``.

        Raises:
            ValueError: if ``x`` does not hold exactly 3 or 5 tensors.
        """
        # With only Q_seq, K_seq, V_seq no masking is done.
        # With Q_seq, K_seq, V_seq, Q_len, V_len the surplus positions are masked.
        if len(x) == 3:
            Q_seq, K_seq, V_seq = x
            Q_len, V_len = None, None
        elif len(x) == 5:
            Q_seq, K_seq, V_seq, Q_len, V_len = x
        else:
            raise ValueError(
                "Attention expects 3 or 5 input tensors, got {}".format(len(x)))

        # Linear projections of Q, K and V, split into heads.
        Q_seq = self._project(Q_seq, self.WQ)
        K_seq = self._project(K_seq, self.WK)
        V_seq = self._project(V_seq, self.WV)

        # Scaled dot-product scores between every query and key step.
        A = batch_dot(Q_seq, K_seq, axes=[3, 3]) / self.size_per_head**0.5
        # Move the last axis to position 1, where Mask operates, then restore it.
        A = K.permute_dimensions(A, (0, 3, 2, 1))
        A = self.Mask(A, V_len, 'add')
        A = K.permute_dimensions(A, (0, 3, 2, 1))
        A = K.softmax(A)

        # Weighted sum of the values, then merge the heads back together.
        O_seq = batch_dot(A, V_seq, axes=[3, 2])
        O_seq = K.permute_dimensions(O_seq, (0, 2, 1, 3))
        O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
        O_seq = self.Mask(O_seq, Q_len, 'mul')
        return O_seq

    def compute_output_shape(self, input_shape):
        """Batch and step count follow the query input; the width is ``output_dim``."""
        return (input_shape[0][0], input_shape[0][1], self.output_dim)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super(Attention, self).get_config()
        config.update({'nb_head': self.nb_head,
                       'size_per_head': self.size_per_head})
        return config

In [12]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from keras.layers import Input, Embedding, Dense, Flatten, GlobalAvgPool1D, Dropout
from keras.optimizers import adam
from keras.models import Model

class create_model(object):
    """Builder for the embedding -> positional encoding -> self-attention classifier."""

    def __init__(self, max_features=50000, max_len=100, embed_size=300, pretrain_embeding_matrix=None):
        """
        Args:
            max_features: Number of rows in the embedding table.
            max_len: Length of each input id sequence.
            embed_size: Width of each embedding vector.
            pretrain_embeding_matrix: Optional ``(max_features, embed_size)``
                array used to initialise the embedding layer. ``None`` leaves
                the layer with its default initialisation.
        """
        self.max_features = max_features
        self.max_len = max_len
        self.embed_size = embed_size
        self.pretrain_embed_matrix = pretrain_embeding_matrix
        # NOTE(review): the lowercase `adam` name comes from this cell's import;
        # confirm it exists in the installed Keras version.
        self.adam = adam(lr=0.01)

    def get(self):
        """Build and compile the model.

        Returns:
            A compiled Keras ``Model`` mapping ``(batch, max_len)`` ids to one
            sigmoid output per row.
        """
        inputs = Input(shape=(self.max_len,))

        # Only pass `weights` when a matrix was supplied: the constructor default
        # is None, and `weights=[None]` is not a valid initial weight list.
        embedding_kwargs = {}
        if self.pretrain_embed_matrix is not None:
            embedding_kwargs['weights'] = [self.pretrain_embed_matrix]
        x = Embedding(self.max_features, self.embed_size, **embedding_kwargs)(inputs)  # shape: batch_size, max_len, emb_size
        x = Position_Embeddding()(x)

        # Self-attention: the same tensor serves as query, key and value.
        O_seq = Attention(8, 4)([x, x, x])
        O_seq = GlobalAvgPool1D()(O_seq)
        O_seq = Dropout(0.1)(O_seq)
        Outputs = Dense(1, activation='sigmoid')(O_seq)

        model = Model(inputs=inputs, outputs=Outputs)
        model.compile(optimizer=self.adam, loss='binary_crossentropy', metrics=['accuracy'])

        return model

# Configure the builder for 70-token inputs and the pretrained embedding matrix,
# then build the compiled network and print its layer summary.
model_factory = create_model(
    max_len=70,
    pretrain_embeding_matrix=embed_matrix,
)
model = model_factory.get()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 70)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 70, 300)      15000000    input_3[0][0]                    
__________________________________________________________________________________________________
position__embeddding_2 (Positio (None, 70, 300)      0           embedding_3[0][0]                
__________________________________________________________________________________________________
attention_2 (Attention)         (None, 70, 32)       28800       position__embeddding_2[0][0]     
                                                                 position__embeddding_2[0][0]     
          

In [13]:
# Train for two epochs, evaluating on the held-out split after each one.
model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fa993317e80>

In [14]:
from sklearn import metrics

pred_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)
# Sanity check: list any validation predictions that are exactly 1.
print(pred_glove_val_y[pred_glove_val_y==1])

# Sweep decision thresholds and remember the best one, so the result of the
# search is available as a value instead of only as printed text.
best_thresh, best_f1 = None, -1.0
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    f1 = metrics.f1_score(val_y, (pred_glove_val_y > thresh).astype(int))
    print("F1 score at threshold {0} is {1}".format(thresh, f1))
    if f1 > best_f1:
        best_thresh, best_f1 = thresh, f1
print("Best F1 score {0} at threshold {1}".format(best_f1, best_thresh))

[]
F1 score at threshold 0.1 is 0.5735788086228156
F1 score at threshold 0.11 is 0.5827426271112278
F1 score at threshold 0.12 is 0.5903776978417266
F1 score at threshold 0.13 is 0.5975126669737448
F1 score at threshold 0.14 is 0.6039837628622675
F1 score at threshold 0.15 is 0.6076656394453004
F1 score at threshold 0.16 is 0.6125258086717137
F1 score at threshold 0.17 is 0.6171342685370742
F1 score at threshold 0.18 is 0.6199969408045685
F1 score at threshold 0.19 is 0.6218513527521509
F1 score at threshold 0.2 is 0.6241373860822842
F1 score at threshold 0.21 is 0.6256552904675297
F1 score at threshold 0.22 is 0.6263515349089922
F1 score at threshold 0.23 is 0.6268211920529801
F1 score at threshold 0.24 is 0.6267420383948061
F1 score at threshold 0.25 is 0.6263910969793324
F1 score at threshold 0.26 is 0.6256392575992645
F1 score at threshold 0.27 is 0.6255024174287879
F1 score at threshold 0.28 is 0.623848878394333
F1 score at threshold 0.29 is 0.6207845013154747
F1 score at threshol

In [15]:
# Score the test set, then binarise the sigmoid outputs at a fixed cut-off.
# NOTE(review): the validation sweep recorded in this notebook peaked near 0.23,
# not 0.35 - confirm the cut-off used here is intentional.
test_probs = model.predict([test_X], batch_size=1024, verbose=1)
output = (test_probs > 0.35).astype(int).flatten()

submission = pd.DataFrame({'qid': test_qid, 'prediction': output})

# Not the last expression of the cell, so this preview is not displayed.
submission.head()
submission.to_csv('submission.csv', index=False)

# Confirm the submission file was written to the working directory.
print(os.listdir('./'))

['submission.csv', '__notebook_source__.ipynb', '.ipynb_checkpoints']
