# Import dataset in to dataframe

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = -1

# Analyze train and test data

In [2]:
print("Sample insincere questions")
train_df.loc[train_df['target'] == 1].sample(10)

Sample insincere questions


Unnamed: 0,qid,question_text,target
750793,9317f87a6f9ea9ea604c,People don’t love me they are a bunch of fake personality’s there’s a lot of them out there do you agre?,1
1245569,f4187132c2dba949a8c7,ANONYMOUS: Why do liberals resort to making death threats when losing a debate on the internet?,1
540853,69f86ce0f8e6d3f1d452,Can you get pregnant with a dog?,1
368899,4853b0ce177431d3d0ce,Why can religions totally brainwash to make you believe what is so obviously not true and control every thought action and even hurt loved ones why don’t they see reality?,1
1044793,ccb9aa86715e61496586,"The smartest people in a university are generally the mathematicians and physicists. Who, then, are on the other end of the scale?",1
967080,bd788652c748007de709,Is sucking your own penis considered being gay?,1
429196,541c28bbbd706c1dc995,Why is patriarchy replacing women with trans women? Why is self-ID happening now? What is the agenda?,1
253525,319e3c53722affeae615,Forty-nine percent of Democrats approve of the meeting between Mr. Trump and Kim. Why are there almost no liberals on Quora who approve of the meeting?,1
1033676,ca8c8cf9cd824fe3ef78,Pakistan is great country .blessing of allah pak.no enemy can finish.pak is lovly country.India is only show and lier in front of world?,1
351215,44d66ddf7251d30384bd,Are the Chinese good at everything?,1


In [3]:
print("Sample sincere questions")
train_df.loc[train_df['target'] == 0].sample(10)

Sample sincere questions


Unnamed: 0,qid,question_text,target
469325,5be6f25648f53a85b095,"If I look at a white wall, I see millions of multicoloured dits. Why does this happen to me?",0
1075463,d2bd329916f25421a685,"If you snatch a gun away from an attacker, are you still covered by self defense laws if you shoot them (USA)?",0
807860,9e4bfc75f423e3b40963,"Why does viral things can ruin lives, and why some people can coupe with it?",0
122965,181202dd9751c06abeba,How do I fix my Windows 10 display to have the scale to be normal? Because it zoomed in for some reason and I didn't press any button. And it still says the scale is 100%. How would I fix this?,0
154978,1e508970fdd54100896a,How many pounds are in a fuckton?,0
962962,bcaba6c9b6b465c2790b,What is the subject seed in Tamil name variety?,0
616660,78c29d96f6349ce4f863,Is it a good time to enter mining as all new upcoming coins are moving to proof of stake?,0
1305397,ffdd7f950778fb7e688a,What is the difference between demonization and dehumanizing? What's the difference between romanticizing and fetishizing? What is the difference between stigma and stereotypes?,0
361176,46cafc71513ed35d6f18,How do I reduce face fat and belly fat while trying to have a stiff body?,0
651694,7fa578d8089ecf836b27,Why are me and my ex still so close after our breakup?,0


In [4]:
import numpy as np


target_ratios = train_df.target.value_counts(normalize=True)

print(target_ratios)

target_ratios.plot(kind='bar', title='Ratios (target)')

0    0.937837
1    0.062163
Name: target, dtype: float64


<matplotlib.axes._subplots.AxesSubplot at 0x7f4bf4603cc0>

In [5]:
print('Average word length of questions in train is {0:.0f}.'.format(np.mean(train_df['question_text'].apply(lambda x: len(x.split())))))
print('Average word length of questions in test is {0:.0f}.'.format(np.mean(test_df['question_text'].apply(lambda x: len(x.split())))))

Average word length of questions in train is 13.
Average word length of questions in test is 13.


In [6]:
print('Max word length of questions in train is {0:.0f}.'.format(np.max(train_df['question_text'].apply(lambda x: len(x.split())))))
print('Max word length of questions in test is {0:.0f}.'.format(np.max(test_df['question_text'].apply(lambda x: len(x.split())))))

Max word length of questions in train is 134.
Max word length of questions in test is 87.


In [7]:
print('Average character length of questions in train is {0:.0f}.'.format(np.mean(train_df['question_text'].apply(lambda x: len(x)))))
print('Average character length of questions in test is {0:.0f}.'.format(np.mean(test_df['question_text'].apply(lambda x: len(x)))))

Average character length of questions in train is 71.
Average character length of questions in test is 70.


In [8]:
print('Max character length of questions in train is {0:.0f}.'.format(np.max(train_df['question_text'].apply(lambda x: len(x)))))
print('Max character length of questions in test is {0:.0f}.'.format(np.max(test_df['question_text'].apply(lambda x: len(x)))))

Max character length of questions in train is 1017.
Max character length of questions in test is 588.


In [9]:
print('p999 character length of questions in train is {0:.0f}.'.format(np.percentile(train_df['question_text'].apply(lambda x: len(x)), 99.9)))
print('p999 character length of questions in test is {0:.0f}.'.format(np.percentile(test_df['question_text'].apply(lambda x: len(x)), 99.9)))

p999 character length of questions in train is 249.
p999 character length of questions in test is 249.


## **Preparing the text data**

First, we will iterate over the text questions are stored, and format them into a list.

In [10]:
X_train = train_df['question_text'].fillna('+++').tolist()
X_val = val_df['question_text'].fillna('+++').tolist()
X_test = test_df['question_text'].fillna('+++').tolist()

print('Found %s training questions.' % len(X_train))
print('Found %s validation questions.' % len(X_val))
print('Found %s test questions.' % len(X_test))

Found 1044897 training questions.
Found 261225 validation questions.
Found 56370 test questions.


In [11]:
%%time 

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

MAX_SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 300
MAX_WORDS = 100000

tokenizer = Tokenizer(num_words=MAX_WORDS, lower=True, split=' ', 
                       char_level=False, oov_token=None, document_count=0,
                      )
                                   
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)

X_val = tokenizer.texts_to_sequences(X_val)
X_val = pad_sequences(X_val, maxlen=MAX_SEQUENCE_LENGTH)

X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)

y_train = train_df['target']
y_val = val_df['target']

print('Shape of X_train:', X_train.shape)
print('Shape of y_train:', y_train.shape)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Using TensorFlow backend.


Shape of X_train: (1044897, 250)
Shape of y_train: (1044897,)
Found 196192 unique tokens.
CPU times: user 55.5 s, sys: 784 ms, total: 56.3 s
Wall time: 56.4 s


# Setup Embedding layer

In [15]:
from gensim.models import KeyedVectors
import numpy as np
import os

def loadEmbeddings(path, dimensions, mode='r', encoding=None, errors=None):
    print('Loading embeddings from: %s' %path)
    embeddings = {}
    f = open(path, buffering=((2<<16) + 8), mode=mode, encoding=encoding, errors=errors)
    for line in f:
        values = line.split()
        word = ''.join(values[:-dimensions])
        coefs = np.asarray(values[-dimensions:], dtype='float32')
        embeddings[word] = coefs
    f.close()
    print('Found %s word vectors.' % len(embeddings))
    return embeddings

def loadEmbeddingsGensim(path, dimensions, binary=True):
    print('Loading embeddings from: %s' %path)
    embeddings = {}
    gensim_vecs = KeyedVectors.load_word2vec_format(path, binary=binary)
    for word, vector in zip(gensim_vecs.vocab, gensim_vecs.vectors):
        coefs = np.asarray(vector[-dimensions:], dtype='float32')
        embeddings[word] = coefs
    print('Found %s word vectors.' % len(embeddings))
    return embeddings
    

In [16]:
embeddings_index = {}

In [117]:
%%time

glove_path = os.path.join('..', 'input', 'embeddings', 'glove.840B.300d', 'glove.840B.300d.txt')
embeddings_index['glove'] = loadEmbeddings(glove_path, EMBEDDING_DIM)

Loading embeddings from: ../input/embeddings/glove.840B.300d/glove.840B.300d.txt
Found 2195892 word vectors.
CPU times: user 2min 25s, sys: 4.09 s, total: 2min 29s
Wall time: 2min 29s


In [114]:
%%time

paragram_path = os.path.join('..', 'input', 'embeddings', 'paragram_300_sl999', 'paragram_300_sl999.txt')
embeddings_index['paragram'] = loadEmbeddings(paragram_path, EMBEDDING_DIM, encoding='utf8', errors='ignore')

Loading embeddings from: ../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt
Found 1703663 word vectors.
CPU times: user 1min 53s, sys: 1.97 s, total: 1min 55s
Wall time: 1min 55s


In [115]:
%%time

wiki_path = os.path.join('..', 'input', 'embeddings', 'wiki-news-300d-1M', 'wiki-news-300d-1M.vec')
embeddings_index['wiki'] = loadEmbeddings(wiki_path, EMBEDDING_DIM)

Loading embeddings from: ../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec
Found 999995 word vectors.
CPU times: user 1min 4s, sys: 1.7 s, total: 1min 6s
Wall time: 1min 6s


In [17]:
%%time

google_news_path = os.path.join('..', 'input', 'embeddings', 'GoogleNews-vectors-negative300', 'GoogleNews-vectors-negative300.bin')
embeddings_index['google_news'] = loadEmbeddingsGensim(google_news_path, EMBEDDING_DIM)

Loading embeddings from: ../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin
Found 3000000 word vectors.
CPU times: user 1min 38s, sys: 4.79 s, total: 1min 42s
Wall time: 1min 42s


In [None]:
import sys

sys.getsizeof(embeddings_index)


In [78]:
EMBEDDING_DIM = 300

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [79]:
from keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

# Setup model




In [80]:
%%time

from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

# train a 1D convnet with global maxpooling
inp = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
x = embedding_layer(inp)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
out = Dense(1, activation='sigmoid')(x)

model = Model(inp, out)

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 250)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 250, 300)          58857900  
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 246, 128)          192128    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 49, 128)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 45, 128)           82048     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 9, 128)            0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 5, 128)            82048     
__________

In [81]:
%%time 

from keras.layers import Dense, Dropout, Input, GlobalMaxPool1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, Bidirectional
from keras.layers import Activation, BatchNormalization, CuDNNGRU
from keras.models import Model


inp = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
x = embedding_layer(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = BatchNormalization()(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, use_bias=False)(x)
x = BatchNormalization()(x)
x = Activation("relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, use_bias=False)(x)
x = BatchNormalization()(x)
out = Activation("sigmoid")(x)


model = Model(inp, out)

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 250)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 250, 300)          58857900  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 250, 128)          140544    
_________________________________________________________________
batch_normalization_7 (Batch (None, 250, 128)          512       
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 128)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 16)                2048      
_________________________________________________________________
batch_normalization_8 (Batch (None, 16)                64        
__________

# Compile the model

In [82]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Setup f1-score, precision and recall metrics

In [83]:
'''
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, precision_recall_fscore_support

class Metrics(Callback):


    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []
 

    def on_epoch_end(self, epoch, logs={}):
        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
        val_targ = self.validation_data[1]
        _val_precision, _val_recall, _val_f1, _ = precision_recall_fscore_support(val_targ, val_predict, average='binary')
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print(" — val_f1: %f — val_precision: %f — val_recall: %f" %(_val_f1, _val_precision, _val_recall))
        return

metrics = Metrics()
'''

'\nfrom keras.callbacks import Callback\nfrom sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, precision_recall_fscore_support\n\nclass Metrics(Callback):\n\n\n    def on_train_begin(self, logs={}):\n        self.val_f1s = []\n        self.val_recalls = []\n        self.val_precisions = []\n \n\n    def on_epoch_end(self, epoch, logs={}):\n        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()\n        val_targ = self.validation_data[1]\n        _val_precision, _val_recall, _val_f1, _ = precision_recall_fscore_support(val_targ, val_predict, average=\'binary\')\n        self.val_f1s.append(_val_f1)\n        self.val_recalls.append(_val_recall)\n        self.val_precisions.append(_val_precision)\n        print(" — val_f1: %f — val_precision: %f — val_recall: %f" %(_val_f1, _val_precision, _val_recall))\n        return\n\nmetrics = Metrics()\n'

# Compute Class Weights

Since there is a significant target inbalance.

In [84]:
'''
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
'''

"\nfrom sklearn.utils import class_weight\n\nclass_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)\n"

# Train the Model

Use model checkpointing to save the model that attains the best validation loss.

In [85]:
%%time 
#model.fit(X_train, y_train, validation_data=(X_val, y_val),
#          epochs=2, batch_size=128, callbacks=[metrics], class_weight=class_weights)

model.fit(X_train, y_train, validation_data=(X_val, y_val),
          epochs=2, batch_size=1024)

Train on 1044897 samples, validate on 261225 samples
Epoch 1/2
Epoch 2/2
CPU times: user 2min 21s, sys: 27.8 s, total: 2min 49s
Wall time: 2min 55s


<keras.callbacks.History at 0x7f929280b6d8>

# Predict validation labels

In [86]:
%%time

pred_val = model.predict([X_val], batch_size=1024, verbose=1)


CPU times: user 6.62 s, sys: 1.48 s, total: 8.11 s
Wall time: 7.12 s


# Find best threshold

In [87]:
from sklearn.metrics import f1_score

def bestThreshold(y_true,y_pred):
    idx = 0
    cur_f1 = 0
    max_f1 = 0
    thres = 0
    for idx in np.arange(0.1, 0.501, 0.01):
        cur_f1 = f1_score(y_true, np.array(y_pred)> idx)
        if cur_f1 > max_f1:
            max_f1 = cur_f1
            thres = idx
    print('best threshold is {:.4f} with F1 score: {:.4f}'.format(thres, max_f1))
    return thres
threshold = bestThreshold(y_val,pred_val)


best threshold is 0.2900 with F1 score: 0.6552


# Predict test labels

In [88]:
%%time

pred_test = model.predict([X_test], batch_size=1024, verbose=1)


CPU times: user 1.41 s, sys: 268 ms, total: 1.68 s
Wall time: 1.47 s


# Prepare submission

In [89]:
submission_df = pd.DataFrame({"qid":test_df["qid"].values})
submission_df['prediction'] = (pred_test > threshold).astype(int)
submission_df.to_csv("submission.csv", index=False)