In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files are available under the input directory.
input_listing = os.listdir("../input")
print(input_listing)

# Any results you write to the current directory are saved as output.

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# --- Configuration ---------------------------------------------------------
path = '../input'
max_features = 50000  # passed to the tokenizer as num_words
max_len = 100         # every question is padded/truncated to this many tokens


# --- Training data ---------------------------------------------------------
csv_path_train = os.path.join(path, 'train.csv')
csv_data_train = pd.read_csv(csv_path_train)
# Hold out 10% of the labelled rows for validation; the fixed seed keeps the split stable.
train_data, val_data = train_test_split(csv_data_train, test_size=0.1, random_state=2018)

# Missing questions are replaced by a placeholder token so the tokenizer never sees NaN.
train_X = train_data["question_text"].fillna("_na_").values
val_X = val_data["question_text"].fillna("_na_").values
train_y = train_data['target'].values
val_y = val_data['target'].values

# The vocabulary is fitted on the training split only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
# NOTE: despite its name, `word_index` holds the tokenizer's index -> word
# mapping (`index_word`); the embedding-matrix cell iterates it as (index, word).
word_index = tokenizer.index_word
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
# Use the shared `max_len` constant (previously a duplicated literal 100) so the
# padded length cannot drift away from the configured value.
train_X = pad_sequences(train_X, maxlen=max_len)
val_X = pad_sequences(val_X, maxlen=max_len)

# --- Test data -------------------------------------------------------------
csv_path_test = os.path.join(path, 'test.csv')
csv_data_test = pd.read_csv(csv_path_test)

test_X = csv_data_test["question_text"].fillna("_na_").values
test_X = tokenizer.texts_to_sequences(test_X)
test_X = pad_sequences(test_X, maxlen=max_len)

# Question ids are kept for the submission file.
test_qid = csv_data_test['qid']

# The raw frames are no longer needed once the arrays above exist.
del csv_data_train
del csv_data_test

print(train_X.shape, val_X.shape, test_X.shape, train_y.shape, val_y.shape, test_qid.shape, len(word_index))

In [None]:
from tqdm import tqdm

# Width of each embedding vector; matches the "300d" GloVe file loaded below.
embed_size = 300
# List the contents of the GloVe embedding directory.
print(os.listdir("../input/embeddings/glove.840B.300d"))
def create_embed_matrix():
    """Build the embedding matrix for the tokenizer vocabulary from GloVe vectors.

    Reads ``glove.840B.300d.txt`` under ``path`` and fills a
    ``(max_features, embed_size)`` matrix in which row ``i`` is the GloVe vector
    of the word whose tokenizer index is ``i``. Rows whose word has no GloVe
    entry (and rows with no word at all) keep a random draw from a normal
    distribution with the mean and standard deviation of all GloVe values.

    Relies on the module-level names ``path``, ``max_features``, ``embed_size``
    and ``word_index`` (iterated as ``index -> word`` pairs).

    Returns:
        numpy.ndarray: matrix of shape ``(max_features, embed_size)``.
    """
    embedding_file_path = os.path.join(path, 'embeddings', 'glove.840B.300d', 'glove.840B.300d.txt')

    embed_dict = {}
    # Read as UTF-8 explicitly instead of depending on the platform default encoding.
    with open(embedding_file_path, encoding='utf8') as file:
        for o in tqdm(file):
            # Split from the right: the last `embed_size` fields are the numbers, so a
            # token that itself contains a space stays in one piece.
            split_o = o.rstrip("\n").rsplit(" ", embed_size)
            embed_dict[split_o[0]] = np.array(split_o[1:], dtype=np.float32)

    # np.stack requires a real sequence; a dict view is rejected by newer NumPy.
    all_embeds = np.stack(list(embed_dict.values()))
    embed_mean, embed_std = all_embeds.mean(), all_embeds.std()

    # Draw from N(mean, std). The previous np.random.uniform(mean, std) treated the
    # two statistics as lower/upper bounds, which yields a skewed initialisation.
    embed_matrix = np.random.normal(embed_mean, embed_std, size=(max_features, embed_size))

    for key, word in tqdm(word_index.items()):
        # Indices outside the matrix have no row to fill.
        if key >= max_features:
            continue
        embed_vector = embed_dict.get(word)
        if embed_vector is not None:
            embed_matrix[key] = embed_vector

    return embed_matrix

# Build the pretrained embedding table once; the model cells below pass it to their builders.
embed_matrix = create_embed_matrix()
print(np.shape(embed_matrix))

In [None]:
# References for the models below:
# "Convolutional Neural Networks for Sentence Classification"
# https://arxiv.org/pdf/1408.5882.pdf
# "Character-level Convolutional Networks for Text Classification"
# Kaggle kernel: https://www.kaggle.com/yekenot/2dcnn-textclassifier

import warnings

# Install the filter before the Keras imports that follow so FutureWarning
# messages are ignored from this point on.
warnings.simplefilter('ignore', FutureWarning)
from keras.layers import Input, Conv1D, Embedding, MaxPool1D, Dense, Dropout, Flatten, Concatenate
from keras.layers.normalization import BatchNormalization
from keras.optimizers import adam
from keras.models import Model

class create_model(object):
    """Builder for a two-stage 1-D convolutional binary classifier over token ids.

    The network produced by :meth:`get_chara_level_Net` is::

        Input(max_len) -> Embedding
                       -> Conv1D(kernel 7) -> MaxPool1D(3)
                       -> Conv1D(kernel 5) -> MaxPool1D(3)
                       -> Flatten
                       -> Dense(Output_Unit) -> Dropout(0.1) -> Dense(Output_Unit)
                       -> Dense(1, sigmoid)

    Args:
        num_filter: number of filters in each Conv1D layer.
        Output_Unit: width of the two hidden Dense layers; must be set before
            the network is built.
        kernel_size: stored on the instance but not used by this builder
            (the kernel sizes are fixed at 7 and 5).
        max_features: number of rows of the Embedding layer.
        max_len: length of each input sequence.
        embed_size: width of each embedding vector.
        pretrain_embeding_matrix: optional initial weights for the Embedding
            layer. When ``None`` the layer uses Keras' default initialisation.
    """

    def __init__(self, num_filter, Output_Unit=None, kernel_size=None, max_features=50000, max_len=100, embed_size=300, pretrain_embeding_matrix=None):

        self.max_features = max_features
        self.max_len = max_len
        self.embed_size = embed_size
        self.kernel_size = kernel_size
        self.Output_Unit = Output_Unit
        self.num_filter = num_filter
        self.pretrain_embed_matrix = pretrain_embeding_matrix
        self.adam = adam(lr=0.01)

    def _embedding_layer(self):
        """Return the Embedding layer, seeded with the pretrained matrix when one was given."""
        if self.pretrain_embed_matrix is None:
            # Passing weights=[None] would fail inside Keras; fall back to the default init.
            return Embedding(self.max_features, self.embed_size)
        return Embedding(self.max_features, self.embed_size, weights=[self.pretrain_embed_matrix])

    def get_chara_level_Net(self, check=False):
        """Build and compile the stacked-convolution model.

        Args:
            check: when true, print ``model.summary()`` after compiling.

        Returns:
            The compiled Keras ``Model`` (Adam optimizer, binary cross-entropy
            loss, accuracy metric).

        Raises:
            ValueError: if ``Output_Unit`` was not provided.
        """
        if self.Output_Unit is None:
            # Fail early with a clear message instead of an error deep inside Dense().
            raise ValueError("Output_Unit must be set to build the dense layers")

        inputs = Input(shape=(self.max_len,))
        x = self._embedding_layer()(inputs)  # shape: batch_size, max_len, emb_size
        net = Conv1D(filters=self.num_filter, kernel_size=(7),
                        kernel_initializer='he_normal', activation='tanh')(x)
        net = MaxPool1D(pool_size=(3))(net)
        net = Conv1D(filters=self.num_filter, kernel_size=(5),
                    kernel_initializer='he_normal', activation='tanh')(net)
        net = MaxPool1D(pool_size=(3))(net)

        net = Flatten()(net)
        net = Dense(self.Output_Unit, activation='tanh', use_bias=True)(net)
        net = Dropout(0.1)(net)
        net = Dense(self.Output_Unit, activation='tanh', use_bias=True)(net)
        # Single sigmoid unit: the model outputs one probability per input.
        outputs = Dense(1, activation='sigmoid', use_bias=False)(net)
        model = Model(inputs=inputs, outputs=outputs)
        model.compile(optimizer=self.adam, loss='binary_crossentropy', metrics=['accuracy'])

        if check:
            model.summary()

        return model

# Stacked-convolution model: 48 filters per conv layer, 96 units per hidden
# dense layer, embeddings initialised from the GloVe matrix.
model = create_model(
    num_filter=48,
    Output_Unit=96,
    pretrain_embeding_matrix=embed_matrix,
).get_chara_level_Net(check=True)

In [None]:
# Train for three epochs, passing the held-out split as validation data.
model.fit(
    x=train_X,
    y=train_y,
    batch_size=1024,
    epochs=3,
    validation_data=(val_X, val_y),
)

In [None]:
from sklearn import metrics

# Predicted probabilities for the validation split.
pred_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)
# Show any predictions that are exactly 1.
saturated_mask = pred_glove_val_y == 1
print(pred_glove_val_y[saturated_mask])
# Report the validation F1 score for each cut-off from 0.10 to 0.50 in steps of 0.01.
f1_report = "F1 score at threshold {0} is {1}"
for raw_cut in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(raw_cut, 2)
    hard_labels = (pred_glove_val_y > thresh).astype(int)
    print(f1_report.format(thresh, metrics.f1_score(val_y, hard_labels)))

In [None]:
# Predicted probabilities for the test questions.
test_probs = model.predict([test_X], batch_size=1024, verbose=1)
# Binarise at 0.35 and flatten the (n, 1) column into a 1-D label vector.
# NOTE(review): 0.35 was presumably picked from the validation F1 sweep - confirm.
output = (test_probs > 0.35).astype(int).flatten()

submission = pd.DataFrame({'qid':test_qid,
                          'prediction':output})

# A bare `submission.head()` in the middle of a cell is evaluated and discarded,
# so the preview was never shown; print it explicitly.
print(submission.head())
submission.to_csv('submission_character_level.csv', index=False)
print(os.listdir('./'))

In [None]:
class create_model(object):
    """Builder for a multi-window convolutional binary classifier over token ids.

    NOTE: this class reuses the name ``create_model`` and therefore shadows the
    earlier class of the same name once this cell has run.

    Args:
        num_filter: number of filters in each Conv1D branch.
        Ouput_Unit: stored on the instance but not used by :meth:`get_cnn`
            (the parameter spelling is kept for caller compatibility).
        kernel_size: non-empty sequence of convolution window sizes; one
            Conv1D + max-pool branch is built per entry.
        max_features: number of rows of the Embedding layer.
        max_len: length of each input sequence.
        embed_size: width of each embedding vector.
        pretrain_embeding_matrix: optional initial weights for the Embedding
            layer. When ``None`` the layer uses Keras' default initialisation.
    """

    def __init__(self, num_filter, Ouput_Unit=None, kernel_size=None, max_features=50000, max_len=100, embed_size=300, pretrain_embeding_matrix=None):

        self.max_features = max_features
        self.max_len = max_len
        self.embed_size = embed_size
        self.kernel_size = kernel_size
        self.Ouput_Unit = Ouput_Unit
        self.num_filter = num_filter
        self.pretrain_embed_matrix = pretrain_embeding_matrix
        self.adam = adam(lr=0.01)

    def get_cnn(self, check=False):
        """Build and compile the multi-window CNN.

        Background (from "Convolutional Neural Networks for Sentence
        Classification"):

        A sentence of length n (padded where necessary) is represented as
        x_{1:n} = x_1 ⊕ x_2 ⊕ ... ⊕ x_n, where ⊕ is the concatenation operator.

        A convolution operation involves a filter w ∈ R^{hk}, which is applied
        to a window of h words to produce a new feature. For example, a feature
        c_i is generated from a window of words x_{i:i+h-1} by
        c_i = f(w · x_{i:i+h-1} + b).

        This filter is applied to each possible window of words in the
        sentence. The model uses multiple filters (with varying window sizes)
        to obtain multiple features, which are passed to a fully connected
        output layer. The paper uses a softmax over labels; this
        implementation uses a single sigmoid unit.

        More discussion:
        While we had expected performance gains through the use of pre-trained
        vectors, we were surprised at the magnitude of the gains.

        Args:
            check: when true, print ``model.summary()`` after compiling.

        Returns:
            The compiled Keras ``Model`` (Adam optimizer, binary cross-entropy
            loss, accuracy metric).

        Raises:
            ValueError: if ``kernel_size`` is ``None`` or empty.
        """
        if not self.kernel_size:
            # Fail early with a clear message instead of a TypeError on indexing None.
            raise ValueError("kernel_size must be a non-empty sequence of window sizes")

        inputs = Input(shape=(self.max_len,))
        if self.pretrain_embed_matrix is None:
            # Passing weights=[None] would fail inside Keras; fall back to the default init.
            embedding = Embedding(self.max_features, self.embed_size)
        else:
            embedding = Embedding(self.max_features, self.embed_size, weights=[self.pretrain_embed_matrix])
        x = embedding(inputs)  # shape: batch_size, max_len, emb_size

        # Create one feature map per window size, over every possible window:
        # c = [c1, c2, ..., c_{n-h+1}]
        # Each has shape (batch_size, max_len - window + 1, num_filter).
        conv_maps = [
            Conv1D(filters=self.num_filter, kernel_size=window,
                   kernel_initializer='he_normal', activation='tanh')(x)
            for window in self.kernel_size
        ]

        # Pool over the full length of each feature map (max-over-time), leaving
        # one value per filter: (batch_size, 1, num_filter).
        pooled = [
            MaxPool1D(pool_size=self.max_len - window + 1)(conv_map)
            for window, conv_map in zip(self.kernel_size, conv_maps)
        ]

        # Concatenate needs at least two inputs; a single branch is used as is.
        z = Concatenate(axis=1)(pooled) if len(pooled) > 1 else pooled[0]
        z = Flatten()(z)
        # Dropout on the penultimate layer (the paper additionally constrains the
        # l2-norms of the weight vectors, Hinton et al., 2012).
        z = Dropout(0.1)(z)
        outputs = Dense(1, activation='sigmoid', use_bias=True)(z)

        model = Model(inputs=inputs, outputs=outputs)
        model.compile(optimizer=self.adam, loss='binary_crossentropy', metrics=['accuracy'])

        if check:
            model.summary()

        return model
    
# Multi-window CNN: 42 filters for each of the window sizes 1, 3, 5 and 7,
# embeddings initialised from the GloVe matrix.
model_cnn = create_model(
    num_filter=42,
    kernel_size=[1, 3, 5, 7],
    pretrain_embeding_matrix=embed_matrix,
).get_cnn(check=True)

In [None]:
# Train for two epochs, passing the held-out split as validation data.
model_cnn.fit(
    x=train_X,
    y=train_y,
    batch_size=1024,
    epochs=2,
    validation_data=(val_X, val_y),
)

In [None]:
# Predicted probabilities of the multi-window CNN for the validation split.
pred_glove_val_y = model_cnn.predict([val_X], batch_size=1024, verbose=1)
# Report the validation F1 score for each cut-off from 0.10 to 0.50 in steps of 0.01.
f1_report = "F1 score at threshold {0} is {1}"
for raw_cut in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(raw_cut, 2)
    hard_labels = (pred_glove_val_y > thresh).astype(int)
    print(f1_report.format(thresh, metrics.f1_score(val_y, hard_labels)))

In [None]:
# Predicted probabilities of the multi-window CNN for the test questions.
test_probs = model_cnn.predict([test_X], batch_size=1024, verbose=1)
# Binarise at 0.34 and flatten the (n, 1) column into a 1-D label vector.
# NOTE(review): 0.34 was presumably picked from the validation F1 sweep - confirm.
output = (test_probs > 0.34).astype(int).flatten()

submission = pd.DataFrame({'qid':test_qid,
                          'prediction':output})

# A bare `submission.head()` in the middle of a cell is evaluated and discarded,
# so the preview was never shown; print it explicitly.
print(submission.head())
submission.to_csv('submission.csv', index=False)
print(os.listdir('./'))