In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import text_to_word_sequence
from keras.utils import Sequence
from keras.layers import *
from keras.models import *
from keras.optimizers import Adam
from keras.callbacks import *
from tqdm import tqdm
from sklearn import metrics
import pickle

In [None]:
# --- File locations ----------------------------------------------------------
path = '../input/'                  # root folder of the competition input files
w_index = 'word_index.pickle'       # where the tokenizer vocabulary mapping is pickled
char_index = 'char_index.pickle'    # where the character -> id mapping is pickled

# --- Model / preprocessing sizes ---------------------------------------------
max_features = 50000    # vocabulary size handed to the Tokenizer and the word Embedding
max_len = 100           # words kept per question
max_word_len = 30       # characters kept per word
embed_size = 300        # width of the word embedding vectors

# Characters that receive their own id in the character-level input.
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"

# --- Raw data ----------------------------------------------------------------
train_data, test_data = (
    pd.read_csv(os.path.join(path, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [None]:
##################################
## Prepare data for train/test
##################################

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split repeatable.
train, val = train_test_split(train_data, test_size=0.2, random_state=2018)


def _question_texts(frame):
    """Return the ``question_text`` column as an array, missing values replaced by "_na_"."""
    return frame["question_text"].fillna("_na_").values


tr_X, v_X, te_X = (_question_texts(frame) for frame in (train, val, test_data))
tr_y, v_y = train['target'].values, val['target'].values

# Character ids start at 1, leaving 0 free for padding.
char_dict = {symbol: position for position, symbol in enumerate(alphabet, start=1)}

# The vocabulary is fitted on the training split only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(tr_X))
# NOTE: despite its name, this is the tokenizer's index -> word mapping (``index_word``).
word_index = tokenizer.index_word

# Persist both lookup tables next to the notebook output.
for target_file, mapping in ((w_index, word_index), (char_index, char_dict)):
    with open(target_file, mode='wb') as f:
        pickle.dump(mapping, f)

In [None]:
##################################
##create input for train/test
##################################

class input_creater(Sequence):
    """Keras ``Sequence`` that turns raw question strings into model-ready batches.

    Every batch consists of two aligned arrays:

    * ``inp1`` -- word ids, shape ``(batch, max_len)``
    * ``inp2`` -- character ids of each word, shape ``(batch, max_len, max_word_len)``

    Both arrays are built from the *same* token list, so position ``t`` in
    ``inp1`` and ``inp2`` always refers to the same word.  This matters because
    the model concatenates the word and character features per time step.
    Words outside the tokenizer vocabulary keep their slot with word id 0
    (instead of being dropped), and both inputs keep the first ``max_len``
    words of a long question.

    Relies on the module-level ``tokenizer``, ``char_dict``, ``max_len`` and
    ``max_word_len``.

    Parameters
    ----------
    x : sequence of str
        Question texts.
    y : array-like or None
        Targets aligned with ``x``; ``None`` for inference.
    batch_size : int
        Number of questions per batch (the last batch may be smaller).
    """

    def __init__(self, x, y=None, batch_size=1024):
        super(input_creater, self).__init__()
        self.x = x
        self.y = y
        self.batch_size = batch_size

    def __getitem__(self, index):
        """Return batch ``index`` as ``([inp1, inp2], batch_y)`` or ``[inp1, inp2]`` when ``y`` is None."""
        start = self.batch_size * index
        end = min(start + self.batch_size, len(self.x))
        size = end - start
        batch_x = self.x[start:end]

        inp1 = np.zeros((size, max_len), dtype='int32')
        inp2 = np.zeros((size, max_len, max_word_len))

        vocab = tokenizer.word_index        # word -> id
        num_words = tokenizer.num_words     # ids >= num_words are treated as out-of-vocabulary

        for i, text in enumerate(batch_x):
            # Tokenise exactly the way the tokenizer was fitted, then keep the
            # first max_len words for BOTH inputs.
            words = text_to_word_sequence(text,
                                          filters=tokenizer.filters,
                                          lower=tokenizer.lower,
                                          split=tokenizer.split)[:max_len]
            if not words:
                continue  # the row stays all-zero (pure padding)

            for t, word in enumerate(words):
                word_id = vocab.get(word, 0)
                if num_words and word_id >= num_words:
                    word_id = 0
                inp1[i, t] = word_id

            # Characters missing from char_dict are skipped; short words are
            # zero-padded on the right up to max_word_len.
            char_ids = [[char_dict[char] for char in word if char in char_dict]
                        for word in words]
            inp2[i, :len(words), :] = pad_sequences(char_ids, maxlen=max_word_len, padding='post')

        if self.y is not None:
            batch_y = self.y[start:end]
            return [inp1, inp2], batch_y
        else:
            return [inp1, inp2]

    def __len__(self):
        """Number of batches per epoch (ceil of len(x) / batch_size)."""
        return (len(self.x) + self.batch_size - 1) // self.batch_size

In [None]:
from tqdm import tqdm

def create_embed_matrix():
    """Build the initial word-embedding matrix from the GloVe 840B/300d text file.

    Reads the module-level ``path``, ``max_features``, ``embed_size`` and
    ``word_index`` (a mapping of tokenizer index -> word).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_features, embed_size)``.  Row ``k`` holds the
        pretrained vector of the word with index ``k`` when one exists; every
        other row is drawn from a normal distribution with the mean and
        standard deviation of all pretrained values.
    """
    embedding_file_path = os.path.join(path, 'embeddings', 'glove.840B.300d', 'glove.840B.300d.txt')

    embed_dict = {}
    with open(embedding_file_path, encoding='utf8') as file:
        for o in tqdm(file):
            # The vector is always the last `embed_size` fields, so split from
            # the right: a token that itself contains spaces stays in one piece.
            split_o = o.rstrip('\n').rsplit(' ', embed_size)
            if len(split_o) != embed_size + 1:
                continue  # blank or malformed line
            embed_dict[split_o[0]] = np.array(split_o[1:], dtype=np.float32)

    # np.stack needs a real sequence, not a dict view.
    all_embeds = np.stack(list(embed_dict.values()))
    embed_mean, embed_std = all_embeds.mean(), all_embeds.std()

    # Random rows follow the same distribution as the pretrained vectors.
    # (uniform(mean, std) would treat the two statistics as low/high bounds.)
    embed_matrix = np.random.normal(embed_mean, embed_std, size=(max_features, embed_size))

    for key, word in tqdm(word_index.items()):
        if key >= max_features:
            continue
        embed_vector = embed_dict.get(word)
        if embed_vector is not None:
            embed_matrix[key] = embed_vector

    return embed_matrix

# Build the word-embedding matrix once and print its shape as a quick sanity check.
embed_matrix = create_embed_matrix()
print(embed_matrix.shape)

In [None]:
class Char_embed(object):
    """Character-level CNN that encodes one word (a sequence of character ids) as a vector.

    The word's characters are embedded, passed through several parallel
    ``Conv1D`` branches of different kernel widths, max-pooled over time and
    concatenated.  Optionally a highway layer is applied on top.

    Parameters
    ----------
    filters : list of int
        Number of convolution filters for each branch.
    kernel : list of int
        Kernel width for each branch; must have the same length as ``filters``.
    max_features : int
        Size of the character vocabulary (input dimension of the embedding).
    max_word_len : int
        Number of character ids per word.
    embed_size : int
        Width of the character embedding.
    highway : bool
        When True (default) the pooled features go through a highway layer;
        when False they are returned as they are.

    Raises
    ------
    ValueError
        If ``filters`` and ``kernel`` differ in length.
    """

    def __init__(self, filters,
                 kernel, max_features,
                 max_word_len, embed_size=30,
                 highway=True):

        if len(filters) != len(kernel):
            raise ValueError(
                'filters and kernel must have the same length, got {} and {}'.format(
                    len(filters), len(kernel)))

        self.filters = filters
        self.kernel = kernel
        self.max_features = max_features
        self.max_word_len = max_word_len
        self.embed_size = embed_size
        self.highway = highway

    def build(self):
        """Assemble and return the Keras ``Model`` mapping ``(max_word_len,)`` ids to a feature vector of size ``sum(filters)``."""

        inputs = Input(shape=(self.max_word_len,))

        # Embedding -> (batch_size, max_word_len, embed_size)
        x = Embedding(self.max_features, self.embed_size, input_shape=(self.max_word_len,))(inputs)

        # One convolution branch per kernel width.
        feature_maps = []
        for i in range(len(self.kernel)):
            conv = Conv1D(filters=self.filters[i], kernel_size=(self.kernel[i]),
                          activation='tanh', name='conv_{}'.format(i))(x)  # (batch_size, max_word_len - kernel_size + 1, num_filter)
            feature_maps.append(conv)

        # Max-over-time pooling collapses every branch to (batch_size, num_filter).
        max_pools = []
        for i in range(len(feature_maps)):
            max_pool = GlobalMaxPooling1D(name='Maxovertimepoolinglayer_{}'.format(i))(feature_maps[i])
            max_pools.append(max_pool)

        # Concatenate needs at least two inputs, so a single branch is used directly.
        if len(max_pools) == 1:
            feature_vectors = max_pools[0]
        else:
            feature_vectors = Concatenate(axis=1)(max_pools)

        if not self.highway:
            return Model(inputs=inputs, outputs=feature_vectors)

        # Highway layer: z = t * relu(W y + b) + (1 - t) * y, with gate t = sigmoid(W_t y + b_t)
        transform_gate = Dense(sum(self.filters), activation='sigmoid', name='transform_gate', use_bias=True)(feature_vectors)
        carry_gate = Lambda(lambda x: 1-x, name='carry_gate')(transform_gate)

        z = Dense(sum(self.filters), activation='relu')(feature_vectors)
        z = add([multiply([z, transform_gate]), multiply([carry_gate, feature_vectors])])

        model = Model(inputs=inputs, outputs=z)

        return model

# Character encoder: six convolution branches (kernel widths 1..6).
char_cnn_builder = Char_embed(
    filters=[16, 16, 32, 32, 64, 64],
    kernel=[1, 2, 3, 4, 5, 6],
    max_features=len(char_dict) + 1,  # ids in char_dict start at 1, so add one slot for id 0
    max_word_len=max_word_len,
)
char_embed = char_cnn_builder.build()
char_embed.summary()

In [None]:
class main(object):
    """Question classifier combining word embeddings with character-level word features.

    Parameters
    ----------
    filters, kernels
        Stored but not used by the current architecture (kept so existing
        callers that pass them keep working).
    max_features : int
        Vocabulary size of the word embedding.
    max_len : int
        Number of words per question.
    max_word_len : int
        Number of character ids per word.
    pretrain_embed_matrix : numpy.ndarray or None
        Initial weights of shape ``(max_features, embed_size)`` for the word
        embedding.  When None the embedding is randomly initialised by Keras.
    embed_size : int
        Width of the word embedding.
    """

    def __init__(self, filters, kernels, max_features,
                 max_len, max_word_len,
                 pretrain_embed_matrix=None, embed_size=300):

        self.filters = filters
        self.kernels = kernels
        self.num_unit = 64      # LSTM units per direction
        self.output_dim = 16    # width of the dense layer before the output
        self.max_features = max_features
        self.max_len = max_len
        self.max_word_len = max_word_len
        self.pretrain_embed_matrix = pretrain_embed_matrix
        self.embed_size = embed_size

    def build(self, charRNN):
        """Assemble the Keras model.

        Parameters
        ----------
        charRNN : keras Model
            Per-word character encoder; it is applied to every word position
            through ``TimeDistributed``.

        Returns
        -------
        keras Model
            Takes ``[word_ids, char_ids]`` and outputs one sigmoid probability
            per question.
        """

        Inp1 = Input(shape=(self.max_len,))
        Inp2 = Input(shape=(self.max_len, self.max_word_len))

        # Only hand over initial weights when a pretrained matrix was supplied;
        # passing weights=[None] would fail.
        embedding_kwargs = {}
        if self.pretrain_embed_matrix is not None:
            embedding_kwargs['weights'] = [self.pretrain_embed_matrix]

        x1 = Embedding(self.max_features, self.embed_size, **embedding_kwargs)(Inp1)
        x2 = TimeDistributed(charRNN, input_shape=(self.max_len, self.max_word_len))(Inp2)
        # Word and character features are joined per time step.
        x = Concatenate()([x1, x2])

        x = Bidirectional(LSTM(self.num_unit, return_sequences=True))(x)
        x = GlobalMaxPool1D()(x)
        # NOTE(review): this Dense layer has no activation (linear) -- confirm that is intended.
        x = Dense(self.output_dim)(x)
        x = Dropout(0.1)(x)
        x = Dense(1, activation='sigmoid', use_bias=True)(x)

        model = Model([Inp1, Inp2], x)

        return model
    
# Assemble the full classifier on top of the character encoder built earlier.
tag_lm_builder = main(
    filters=32,
    kernels=[1, 5, 7, 10],
    max_features=max_features,
    max_len=max_len,
    max_word_len=max_word_len,
    pretrain_embed_matrix=embed_matrix,
)
model = tag_lm_builder.build(char_embed)
model.summary()

In [None]:
# File that receives the weights of the best epoch (lowest val_loss).
checkpoint_path = 'TagLM_CNN.h5'

model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# NOTE(review): min_delta=0.1 with patience=1 means an epoch only counts as an
# improvement if val_loss drops by at least 0.1, so training will usually stop
# after the second epoch -- confirm this is intended.
callbacks=[EarlyStopping(monitor='val_loss', patience=1, min_delta=0.1, verbose=1),
            ReduceLROnPlateau(monitor='val_loss', patience=1, min_delta=0.1, factor=0.25, min_lr=0.0001, verbose=1),
            ModelCheckpoint(checkpoint_path, save_best_only=True, save_weights_only=True)]
model.fit_generator(input_creater(tr_X, tr_y, batch_size=800), verbose=1, epochs=3,
                    validation_data=input_creater(v_X, v_y, batch_size=800), callbacks=callbacks)

# After training the in-memory model holds the weights of the LAST epoch.
# Restore the best checkpoint so the cells below predict with the best weights.
if os.path.exists(checkpoint_path):
    model.load_weights(checkpoint_path)

In [None]:
# Predicted probabilities for the validation questions.
pred_v_y = model.predict_generator(input_creater(v_X, batch_size=700), verbose=1)

# Scan cut-offs 0.10, 0.11, ..., 0.50 and keep the first one with the highest F1.
best_thresh, best_score = 0.1, 0.0

candidate_thresholds = np.round(np.arange(0.1, 0.501, 0.01), 2)
for thresh in candidate_thresholds:
    hard_labels = (pred_v_y > thresh).astype(int)
    score = metrics.f1_score(v_y, hard_labels)
    print("F1 score at threshold {0} is {1}".format(thresh, score))
    if score > best_score:
        best_thresh, best_score = thresh, score
print(best_thresh)

In [None]:
# Score the test questions and binarise with the threshold chosen on the validation split.
test_scores = model.predict_generator(input_creater(te_X, batch_size=700), verbose=1)
output = (test_scores > best_thresh).astype(int).flatten()

# One row per test question: its id and the 0/1 prediction.
submission = pd.DataFrame({'qid': test_data.qid.values, 'prediction': output})
print(submission.head())

submission.to_csv('submission.csv', index=False)
print(os.listdir('./'))