# Import dataset in to dataframe

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = -1

# Analyze train and test data

In [3]:
print("Sample insincere questions")
train_df.loc[train_df['target'] == 1].sample(10)

Sample insincere questions


Unnamed: 0,qid,question_text,target
408154,4ff9e60d27ba7abf7ece,"Modi had once said that India is the only original and spiritual home of Hindus. If Rohingya people had been Hindus, would the government accept them?",1
48418,097cd64f2731c6f3d4e9,"Do Polish women randomly sleep with African, Arab, or Asian refugees?",1
646070,7e89ea25f6f632fed373,Can a Muslim girl in India marry a Hindu boy? Should the Hindu boy be slaughtered or spared?,1
482347,5e73ac8934a7694ec000,My dad likes the hair on my female part shaved into a half-moon. Do guys usually like half-moons?,1
683318,85d4d8749be69f29dd75,"Is it true that Native Americans believe in more than 2 genders, or is this just another lie invented by the SJWs?",1
495196,60f74feb8261e1d14b8b,Why all the European women are grannies?,1
698961,88dc8583721df50ff506,"If many women hate older male virgins, such as over 30 and would never associate with them, would those males most likely end up being virgins for life if each year it becomes more difficult for them to lose virginity?",1
434083,5512b1a6a05254dca613,Is it considered much that I have asked around 5000 question on Quora with previously deleted profiles?,1
666182,827487143b2c7b3ec2e4,Should atheists have to pay more or less in taxes?,1
361949,46f1115ced8e14a6d3f2,How true is it that white people are generally smarter than black people?,1


In [4]:
print("Sample sincere questions")
train_df.loc[train_df['target'] == 0].sample(10)

Sample sincere questions


Unnamed: 0,qid,question_text,target
348135,4437268438ec296bb9ac,Can I pass in 1st year MBBS exams if I have failed in all the internal exams?,0
1156059,e285da9dc6671efb7230,What permission and license is required for manufacturing audio equipments?,0
617206,78dd71f42bf8990ae116,Which are North Slavic languages?,0
759521,94ced78ce12278534f1e,How often do contrarians believe their own point of view?,0
682106,8596d4172bbd3481f454,How much electricity does 1 gallon of diesel produce?,0
1271269,f92300e2de5860085eb2,Where can I find sea food restaurants in Quebec?,0
1302231,ff3b96feebaf2bfc70cb,"Who are Johnny Depp's parents, and how important were they for his education?",0
491784,604d7edd86410cb27e3b,Is there any difference between data generalization and summarization?,0
412742,50de72f05dfb18bc1183,"Prior to devaluing and discarding me (second time around now), did she ever authentically like me, or was it always about supply?",0
716304,8c3902a212c421ef0ff6,Why is the British Government so inyent on leaving Europe when only 37% of the population voted to leave?,0


In [5]:
import numpy as np


target_ratios = train_df.target.value_counts(normalize=True)

print(target_ratios)

target_ratios.plot(kind='bar', title='Ratios (target)')

0    0.937837
1    0.062163
Name: target, dtype: float64


<matplotlib.axes._subplots.AxesSubplot at 0x7f63d63fb390>

In [6]:
print('Average word length of questions in train is {0:.0f}.'.format(np.mean(train_df['question_text'].apply(lambda x: len(x.split())))))
print('Average word length of questions in test is {0:.0f}.'.format(np.mean(test_df['question_text'].apply(lambda x: len(x.split())))))

Average word length of questions in train is 13.
Average word length of questions in test is 13.


In [7]:
print('Max word length of questions in train is {0:.0f}.'.format(np.max(train_df['question_text'].apply(lambda x: len(x.split())))))
print('Max word length of questions in test is {0:.0f}.'.format(np.max(test_df['question_text'].apply(lambda x: len(x.split())))))

Max word length of questions in train is 134.
Max word length of questions in test is 87.


In [8]:
print('Average character length of questions in train is {0:.0f}.'.format(np.mean(train_df['question_text'].apply(lambda x: len(x)))))
print('Average character length of questions in test is {0:.0f}.'.format(np.mean(test_df['question_text'].apply(lambda x: len(x)))))

Average character length of questions in train is 71.
Average character length of questions in test is 70.


In [9]:
print('Max character length of questions in train is {0:.0f}.'.format(np.max(train_df['question_text'].apply(lambda x: len(x)))))
print('Max character length of questions in test is {0:.0f}.'.format(np.max(test_df['question_text'].apply(lambda x: len(x)))))

Max character length of questions in train is 1017.
Max character length of questions in test is 588.


In [10]:
print('p999 character length of questions in train is {0:.0f}.'.format(np.percentile(train_df['question_text'].apply(lambda x: len(x)), 99.9)))
print('p999 character length of questions in test is {0:.0f}.'.format(np.percentile(test_df['question_text'].apply(lambda x: len(x)), 99.9)))

p999 character length of questions in train is 249.
p999 character length of questions in test is 249.


## **Preparing the text data**

First, we will iterate over the text questions are stored, and format them into a list.

In [11]:
X_train = train_df['question_text'].fillna('+++').tolist()
X_val = val_df['question_text'].fillna('+++').tolist()
X_test = test_df['question_text'].fillna('+++').tolist()

print('Found %s training questions.' % len(X_train))
print('Found %s validation questions.' % len(X_val))
print('Found %s test questions.' % len(X_test))

Found 1044897 training questions.
Found 261225 validation questions.
Found 56370 test questions.


In [12]:
%%time 

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

MAX_SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 300
MAX_WORDS = 100000

tokenizer = Tokenizer(num_words=MAX_WORDS, lower=True, split=' ', 
                       char_level=False, oov_token=None, document_count=0,
                      )
                                   
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)

X_val = tokenizer.texts_to_sequences(X_val)
X_val = pad_sequences(X_val, maxlen=MAX_SEQUENCE_LENGTH)

X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)

y_train = train_df['target']
y_val = val_df['target']

print('Shape of X_train:', X_train.shape)
print('Shape of y_train:', y_train.shape)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Shape of X_train: (1044897, 250)
Shape of y_train: (1044897,)
Found 196192 unique tokens.
CPU times: user 54.3 s, sys: 712 ms, total: 55 s
Wall time: 55 s


# Setup Embedding layer

In [13]:
from gensim.models import KeyedVectors
import numpy as np
import os

def loadEmbeddings(path, dimensions, mode='r', encoding=None, errors=None):
    print('Loading embeddings from: %s' %path)
    embeddings = {}
    f = open(path, buffering=((2<<16) + 8), mode=mode, encoding=encoding, errors=errors)
    for line in f:
        values = line.split()
        word = ''.join(values[:-dimensions])
        coefs = np.asarray(values[-dimensions:], dtype='float32')
        embeddings[word] = coefs
    f.close()
    print('Found %s word vectors.' % len(embeddings))
    return embeddings

def loadEmbeddingsGensim(path, dimensions, binary=True):
    print('Loading embeddings from: %s' %path)
    embeddings = {}
    gensim_vecs = KeyedVectors.load_word2vec_format(path, binary=binary)
    for word, vector in zip(gensim_vecs.vocab, gensim_vecs.vectors):
        coefs = np.asarray(vector[-dimensions:], dtype='float32')
        embeddings[word] = coefs
    print('Found %s word vectors.' % len(embeddings))
    return embeddings
    

In [14]:
embeddings_index = {}

In [15]:
%%time

glove_path = os.path.join('..', 'input', 'embeddings', 'glove.840B.300d', 'glove.840B.300d.txt')
embeddings_index['glove'] = loadEmbeddings(glove_path, EMBEDDING_DIM)

Loading embeddings from: ../input/embeddings/glove.840B.300d/glove.840B.300d.txt
Found 2195892 word vectors.
CPU times: user 2min 26s, sys: 4.23 s, total: 2min 30s
Wall time: 2min 30s


In [16]:
%%time

paragram_path = os.path.join('..', 'input', 'embeddings', 'paragram_300_sl999', 'paragram_300_sl999.txt')
embeddings_index['paragram'] = loadEmbeddings(paragram_path, EMBEDDING_DIM, encoding='utf8', errors='ignore')

Loading embeddings from: ../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt
Found 1703663 word vectors.
CPU times: user 1min 56s, sys: 3.49 s, total: 1min 59s
Wall time: 1min 59s


In [17]:
%%time

wiki_path = os.path.join('..', 'input', 'embeddings', 'wiki-news-300d-1M', 'wiki-news-300d-1M.vec')
embeddings_index['wiki'] = loadEmbeddings(wiki_path, EMBEDDING_DIM)

Loading embeddings from: ../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec
Found 999995 word vectors.
CPU times: user 1min 4s, sys: 1.85 s, total: 1min 6s
Wall time: 1min 6s


In [18]:
%%time

google_news_path = os.path.join('..', 'input', 'embeddings', 'GoogleNews-vectors-negative300', 'GoogleNews-vectors-negative300.bin')
embeddings_index['google_news'] = loadEmbeddingsGensim(google_news_path, EMBEDDING_DIM)

Loading embeddings from: ../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin
Found 3000000 word vectors.
CPU times: user 1min 35s, sys: 4.98 s, total: 1min 40s
Wall time: 1min 40s


In [19]:
def getEmbeddingMatrix(embedding, word_index, dimensions):
    embedding_matrix = np.zeros((len(word_index) + 1, dimensions))
    for word, i in word_index.items():
        embedding_vector = embedding.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [20]:
from keras.layers import Embedding

#input_layers = {}
embedding_layers = {}
for emb_name, emb in embeddings_index.items():
    #input_layers[emb_name] = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedding_matrix = getEmbeddingMatrix(emb, word_index, EMBEDDING_DIM)
    embedding_layers[emb_name] = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

# Setup model




In [25]:
%%time 

from keras.layers import Dense, Dropout, Input, GlobalMaxPool1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, Bidirectional
from keras.layers import Activation, BatchNormalization, CuDNNGRU
from keras.layers import SpatialDropout1D, Concatenate, Flatten, Reshape
from keras.regularizers import l2
from keras.models import Model


inp = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

#inp_glove = input_layers['glove']
a = embedding_layers['glove'] (inp)
a = Bidirectional(CuDNNGRU(64, return_sequences=True))(a)
a = BatchNormalization()(a)
a = GlobalMaxPool1D()(a)
a = Dense(16, use_bias=False)(a)
a = BatchNormalization()(a)
a = Activation("relu")(a)
a = Dropout(0.1)(a)
'''
a = Dense(1, use_bias=False)(a)
a = BatchNormalization()(a)
out = Activation("sigmoid")(a)
'''

#inp_paragram = input_layers['paragram']
b = embedding_layers['paragram'] (inp)
b = Bidirectional(CuDNNGRU(64, return_sequences=True))(b)
b = BatchNormalization()(b)
b = GlobalMaxPool1D()(b)
b = Dense(16, use_bias=False)(b)
b = BatchNormalization()(b)
b = Activation("relu")(b)
b = Dropout(0.1)(b)


'''
b = Dense(1, use_bias=False)(b)
b = BatchNormalization()(b)
out = Activation("sigmoid")(b)
'''

#inp_wiki = input_layers['wiki']
c = embedding_layers['wiki'] (inp)
c = Bidirectional(CuDNNGRU(64, return_sequences=True))(c)
c = BatchNormalization()(c)
c = GlobalMaxPool1D()(c)
c = Dense(16, use_bias=False)(c)
c = BatchNormalization()(c)
c = Activation("relu")(c)
c = Dropout(0.1)(c)
'''
c = Dense(1, use_bias=False)(c)
c = BatchNormalization()(c)
out = Activation("sigmoid")(c)
'''


#inp_google_news = input_layers['google_news']
d = embedding_layers['google_news'] (inp)
d = Bidirectional(CuDNNGRU(64, return_sequences=True))(d)
d = BatchNormalization()(d)
d = GlobalMaxPool1D()(d)
d = Dense(16, use_bias=False)(d)
d = BatchNormalization()(d)
d = Activation("relu")(d)
d = Dropout(0.1)(d)
'''
d = Dense(1, use_bias=False)(d)
d = BatchNormalization()(d)
out = Activation("sigmoid")(d)
'''



x = Concatenate(axis=1)([a, b, c, d])
x = Dense(64, use_bias=False)(x)
x = BatchNormalization()(x)
#kernel_regularizer=l2(0.01)
x = Activation("relu")(x)
x = Dense(1, use_bias=False)(x)
x = BatchNormalization()(x)
out = Activation("sigmoid")(x)



model = Model(inp, out)

print(model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 250)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 250, 300)     58857900    input_4[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 250, 300)     58857900    input_4[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 250, 300)     58857900    input_4[0][0]                    
__________________________________________________________________________________________________
embedding_

# Compile the model

In [26]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Setup f1-score, precision and recall metrics

In [27]:
'''
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, precision_recall_fscore_support

class Metrics(Callback):


    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []
 

    def on_epoch_end(self, epoch, logs={}):
        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
        val_targ = self.validation_data[1]
        _val_precision, _val_recall, _val_f1, _ = precision_recall_fscore_support(val_targ, val_predict, average='binary')
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print(" — val_f1: %f — val_precision: %f — val_recall: %f" %(_val_f1, _val_precision, _val_recall))
        return

metrics = Metrics()
'''

'\nfrom keras.callbacks import Callback\nfrom sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, precision_recall_fscore_support\n\nclass Metrics(Callback):\n\n\n    def on_train_begin(self, logs={}):\n        self.val_f1s = []\n        self.val_recalls = []\n        self.val_precisions = []\n \n\n    def on_epoch_end(self, epoch, logs={}):\n        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()\n        val_targ = self.validation_data[1]\n        _val_precision, _val_recall, _val_f1, _ = precision_recall_fscore_support(val_targ, val_predict, average=\'binary\')\n        self.val_f1s.append(_val_f1)\n        self.val_recalls.append(_val_recall)\n        self.val_precisions.append(_val_precision)\n        print(" — val_f1: %f — val_precision: %f — val_recall: %f" %(_val_f1, _val_precision, _val_recall))\n        return\n\nmetrics = Metrics()\n'

# Compute Class Weights

Since there is a significant target inbalance.

In [28]:
'''
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
'''

"\nfrom sklearn.utils import class_weight\n\nclass_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)\n"

# Train the Model

Use model checkpointing to save the model that attains the best validation loss.

In [None]:
%%time 
#model.fit(X_train, y_train, validation_data=(X_val, y_val),
#          epochs=2, batch_size=128, callbacks=[metrics], class_weight=class_weights)

model.fit(X_train, y_train, validation_data=(X_val, y_val),
          epochs=2, batch_size=1024)

Train on 1044897 samples, validate on 261225 samples
Epoch 1/2


# Predict validation labels

In [None]:
%%time

pred_val = model.predict([X_val], batch_size=1024, verbose=1)


# Find best threshold

In [None]:
from sklearn.metrics import f1_score

def bestThreshold(y_true,y_pred):
    idx = 0
    cur_f1 = 0
    max_f1 = 0
    thres = 0
    for idx in np.arange(0.1, 0.501, 0.01):
        cur_f1 = f1_score(y_true, np.array(y_pred)> idx)
        if cur_f1 > max_f1:
            max_f1 = cur_f1
            thres = idx
    print('best threshold is {:.4f} with F1 score: {:.4f}'.format(thres, max_f1))
    return thres
threshold = bestThreshold(y_val,pred_val)


# Predict test labels

In [None]:
%%time

pred_test = model.predict([X_test], batch_size=1024, verbose=1)


# Prepare submission

In [None]:
submission_df = pd.DataFrame({"qid":test_df["qid"].values})
submission_df['prediction'] = (pred_test > threshold).astype(int)
submission_df.to_csv("submission.csv", index=False)