In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files and folders the competition's input directory contains.
input_entries = os.listdir("../input")
print(input_entries)

# Any results you write to the current directory are saved as output.

['embeddings', 'sample_submission.csv', 'test.csv', 'train.csv']


In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

path = '../input'
# Characters that get their own index; anything else maps to the OOV token.
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
max_len = 1024


def encode_texts(texts, tokenizer, maxlen):
    """Turn raw strings into a fixed-width matrix of character indices.

    Args:
        texts: iterable of strings to encode.
        tokenizer: Tokenizer whose ``word_index`` maps characters to indices.
        maxlen: number of columns of the returned matrix.

    Returns:
        The array produced by ``pad_sequences`` (called with its default
        padding/truncating behaviour), one row per input string.
    """
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)


# Train data: hold out 10% of the labelled rows for validation.
csv_path_train = os.path.join(path, 'train.csv')
csv_data_train = pd.read_csv(csv_path_train)
train_data, val_data = train_test_split(csv_data_train, test_size=0.1, random_state=2018)

train_y = train_data['target'].values
val_y = val_data['target'].values

# char_level: if True, every character is treated as a token.
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')

# The vocabulary is fixed by `alphabet` (indices 1..len(alphabet)) rather than
# learned from the corpus, so no fit_on_texts pass is needed: the fitted
# word_index would be replaced here anyway. The OOV token takes the next
# free index after the alphabet.
char_list = {char: i + 1 for i, char in enumerate(alphabet)}
tk.word_index = char_list.copy()
tk.word_index[tk.oov_token] = max(char_list.values()) + 1

# Missing questions are replaced by the literal "_na_" before encoding.
train_X = encode_texts(train_data["question_text"].fillna("_na_").values, tk, max_len)
val_X = encode_texts(val_data["question_text"].fillna("_na_").values, tk, max_len)

# Test data goes through exactly the same encoding as train/validation.
csv_path_test = os.path.join(path, 'test.csv')
csv_data_test = pd.read_csv(csv_path_test)
test_X = encode_texts(csv_data_test["question_text"].fillna("_na_").values, tk, max_len)

test_qid = csv_data_test['qid']

# Drop the raw frames; only the encoded arrays are used from here on.
del csv_data_train
del csv_data_test

print(train_X.shape, val_X.shape, test_X.shape, train_y.shape, val_y.shape, test_qid.shape)

Using TensorFlow backend.


(1175509, 1024) (130613, 1024) (56370, 1024) (1175509,) (130613,) (56370,)


In [3]:
# One-hot ("1-of-m") quantization of characters, expressed as an embedding
# table: row 0 is an all-zero vector (index 0 is what padding uses) and
# row k, for k >= 1, is the unit vector of the character with index k.
vocab_size = len(tk.word_index)

pad_weights = np.zeros(vocab_size)
embedding_weights = np.vstack((pad_weights, np.identity(vocab_size)))
print(embedding_weights.shape)

(70, 69)


In [9]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from keras.layers import Input, Conv1D, Embedding, MaxPool1D, Dense, Dropout, Flatten, Concatenate
from keras.optimizers import adam
from keras.models import Model
from keras.initializers import RandomNormal

class char_cnn(object):
    """Character-level CNN for binary text classification.

    The network is: embedding -> six tanh Conv1D layers (kernel sizes
    7, 7, 3, 3, 3, 3, with a size-3 max-pool after the 1st, 2nd and 6th)
    -> flatten -> two tanh Dense layers, each followed by dropout ->
    a single sigmoid output unit.

    Args:
        num_filter: number of filters used by every Conv1D layer.
        num_output: number of units in each of the two hidden Dense layers.
        max_features: number of rows of the embedding table.
        max_len: length of every input sequence.
        embed_size: width of each embedding vector.
        pretrain_embeding_matrix: optional array used as the embedding
            layer's initial weights. When None, the layer is created
            without explicit weights and keeps its default initialisation.
        lr: learning rate passed to the Adam optimizer.
        decay: learning-rate decay passed to the Adam optimizer.
        dropout_rate: rate of both Dropout layers.
    """

    # (kernel_size, followed_by_max_pool) for each convolution, in order.
    _CONV_SPEC = (
        (7, True),
        (7, True),
        (3, False),
        (3, False),
        (3, False),
        (3, True),
    )

    def __init__(self, num_filter, num_output, max_features=70, max_len=1024, embed_size=69,
                 pretrain_embeding_matrix=None, lr=0.01, decay=1e-6, dropout_rate=0.1):

        self.max_features = max_features
        self.max_len = max_len
        self.embed_size = embed_size
        self.num_filter = num_filter
        self.num_output = num_output
        self.pretrain_embed_matrix = pretrain_embeding_matrix
        self.dropout_rate = dropout_rate
        # NOTE(review): defaults reproduce the original hard-coded optimizer
        # settings; lr=0.01 may be worth lowering — unverified.
        self.adam = adam(lr=lr, decay=decay)
        self.initializers = RandomNormal(mean=0.0, stddev=0.05)

    def get(self):
        """Build and compile the Keras model.

        Returns:
            A Model compiled with the instance's Adam optimizer,
            binary cross-entropy loss and the accuracy metric.
        """
        inputs = Input(shape=(self.max_len,))

        # Only hand weights to the layer when a matrix was supplied; passing
        # weights=[None] (the constructor default) is not a usable weight list.
        embedding_kwargs = {}
        if self.pretrain_embed_matrix is not None:
            embedding_kwargs['weights'] = [self.pretrain_embed_matrix]
        # shape: (batch_size, max_len, embed_size)
        net = Embedding(self.max_features, self.embed_size, **embedding_kwargs)(inputs)

        # Each convolution shortens the sequence by kernel_size - 1; each
        # pooling step divides the remaining length by 3 (rounded down).
        for kernel_size, pooled in self._CONV_SPEC:
            net = Conv1D(filters=self.num_filter, kernel_size=kernel_size,
                         kernel_initializer=self.initializers, activation='tanh')(net)
            if pooled:
                net = MaxPool1D(pool_size=3)(net)
        net = Flatten()(net)

        # Two fully connected layers with dropout after each. No weight-norm
        # constraint is applied here.
        for _ in range(2):
            net = Dense(self.num_output, activation='tanh')(net)
            net = Dropout(self.dropout_rate)(net)

        outputs = Dense(1, activation='sigmoid')(net)
        model = Model(inputs=inputs, outputs=outputs)
        model.compile(optimizer=self.adam, loss='binary_crossentropy', metrics=['accuracy'])

        return model
        
# Instantiate the network with 42 filters per convolution and 96 units per
# hidden dense layer, seeding the embedding with the one-hot table.
model = char_cnn(
    num_filter=42,
    num_output=96,
    pretrain_embeding_matrix=embedding_weights,
).get()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 1024)              0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 1024, 69)          4830      
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 1018, 42)          20328     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 339, 42)           0         
_________________________________________________________________
conv1d_20 (Conv1D)           (None, 333, 42)           12390     
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 111, 42)           0         
_________________________________________________________________
conv1d_21 (Conv1D)           (None, 109, 42)           5334      
__________

In [10]:
# Train for two epochs, evaluating on the held-out split after each one.
model.fit(
    x=train_X,
    y=train_y,
    batch_size=512,
    epochs=2,
    validation_data=(val_X, val_y),
)

Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f1da0316d68>

In [12]:
from sklearn import metrics

# Predict on the validation split, then report F1 for every decision
# threshold from 0.10 to 0.50 in steps of 0.01.
pred_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    hard_labels = (pred_val_y > thresh).astype(int)
    f1_at_thresh = metrics.f1_score(val_y, hard_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, f1_at_thresh))

F1 score at threshold 0.1 is 0.0
F1 score at threshold 0.11 is 0.0
F1 score at threshold 0.12 is 0.0
F1 score at threshold 0.13 is 0.0
F1 score at threshold 0.14 is 0.0
F1 score at threshold 0.15 is 0.0
F1 score at threshold 0.16 is 0.0
F1 score at threshold 0.17 is 0.0
F1 score at threshold 0.18 is 0.0
F1 score at threshold 0.19 is 0.0
F1 score at threshold 0.2 is 0.0
F1 score at threshold 0.21 is 0.0
F1 score at threshold 0.22 is 0.0
F1 score at threshold 0.23 is 0.0
F1 score at threshold 0.24 is 0.0
F1 score at threshold 0.25 is 0.0
F1 score at threshold 0.26 is 0.0
F1 score at threshold 0.27 is 0.0
F1 score at threshold 0.28 is 0.0
F1 score at threshold 0.29 is 0.0
F1 score at threshold 0.3 is 0.0
F1 score at threshold 0.31 is 0.0
F1 score at threshold 0.32 is 0.0
F1 score at threshold 0.33 is 0.0
F1 score at threshold 0.34 is 0.0
F1 score at threshold 0.35 is 0.0
F1 score at threshold 0.36 is 0.0
F1 score at threshold 0.37 is 0.0
F1 score at threshold 0.38 is 0.0
F1 score at thres