# Import dataset in to dataframe

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = -1

In [2]:
print("Sample insincere questions")
train_df.loc[train_df['target'] == 1].sample(10)

Sample insincere questions


Unnamed: 0,qid,question_text,target
376293,49c3c879eeb35b7711b2,Don't democrats realize that the progressive ideology has destroyed the Democratic Party?,1
1232529,f18835e3747e515ac34b,"If there is an endless supply of men, especially online and there are beta men that don't mind accepting even unattractive or obese women, how come some women end up involuntarily single for life?",1
1014656,c6d56def6d129abb3135,Do French people in general have such bigoted views towards Syrian rebels?,1
1107771,d915f251921f68ab864c,Why do Persians believe in 'white Aryan Indo-European' Nazi ideology?,1
1127449,dcf1be6c98286df68999,Are Chinese people answering on Quora paid by the CCP in an attempt to shape people’s opinions about the CCP?,1
854190,a75bffebce2c8c7a9432,When would India become Muslim free?,1
428435,53f8d84a4d646b427c8f,Do atheists honestly believe that Darwin's anti-religious bigoted family would have admitted that he had a deathbed conversion?,1
1186254,e8789e001d6f5e8ccdeb,Why are there so many stupid people on Quora. and why do you stupid teenagers not Google this shit?,1
964524,bcf85db9b95f1b012df2,How are dominant groups carrying out racism in America? And what reasons do the dominant group use to justify their racism?,1
1215250,ee2d665a2a19810e8d52,"Why are liberals viewed as anti-authority when they are always going “there ought to be a law” on this and that, and end up regulating/criminalizing more and more day to day behaviors?",1


In [3]:
print("Sample sincere questions")
train_df.loc[train_df['target'] == 0].sample(10)

Sample sincere questions


Unnamed: 0,qid,question_text,target
244246,2fc50134cb76b8b254af,After bankruptcy do you sill have to pay liabilities?,0
1114457,da61e1f4461d6a03ddfc,Can anyone provide tips for clearing DNB CET MD/MS exam?,0
1179464,e721130f8d3e47a54cad,What are some interesting AI use cases in ERP?,0
291567,391bdc2e29577623066d,Are there any social networking sites that are only for talking about politics?,0
84381,1085a12556be93f8e7c0,How do I earn money through online easily?,0
421290,5291ec9be39f9af6b763,Why is Thomas Paine not given more attention in our educational system?,0
599169,7559db840d7d214a84cf,Was Prophet Muhammad vegetarian?,0
384751,4b6071a64751870c0ef6,How does Facebook recognize videos that contain copyrighted materials and warns you after you posted them?,0
1028969,c9a3081dd44460574b7f,What is Zamalek?,0
406440,4fa3b67539fcd5d25932,What is the process of posting of new PO in Public Sector banks? What are zones and circles in PSBs?,0


In [4]:
import numpy as np


target_count = train_df.target.value_counts(normalize=True)

print(target_count)

0    979943
1    64954 
Name: target, dtype: int64


In [None]:
target_count.plot(kind='bar', title='Count (target)');

## **Preparing the text data**

First, we will iterate over the text questions are stored, and format them into a list.

In [5]:
X_train = train_df['question_text'].tolist()
X_val = val_df['question_text'].tolist()
X_test = test_df['question_text'].tolist()

print('Found %s training questions.' % len(X_train))
print('Found %s validation questions.' % len(X_val))
print('Found %s test questions.' % len(X_test))

Found 1044897 training questions.
Found 261225 validation questions.
Found 56370 test questions.


In [6]:
%%time 

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

MAX_SEQUENCE_LENGTH = 1000
EMBEDDING_DIM = 300
MAX_WORDS = 50000

tokenizer = Tokenizer(num_words=None, lower=True, split=' ', 
                       char_level=False, oov_token=None, document_count=0,
                      )
                                   
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)

X_val = tokenizer.texts_to_sequences(X_val)
X_val = pad_sequences(X_val, maxlen=MAX_SEQUENCE_LENGTH)

X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)

y_train = train_df['target']
y_val = val_df['target']

print('Shape of X_train:', X_train.shape)
print('Shape of y_train:', y_train.shape)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Using TensorFlow backend.


Shape of X_train: (1044897, 1000)
Shape of y_train: (1044897,)
Found 196192 unique tokens.
CPU times: user 1min, sys: 3.24 s, total: 1min 3s
Wall time: 1min 3s


# Setup Embedding layer

In [7]:
%%time

import numpy as np
import os

embeddings_index = {}
f = open(os.path.join('..', 'input', 'embeddings', 'glove.840B.300d', 'glove.840B.300d.txt'), buffering=((2<<16) + 8))
for line in f:
    values = line.split()
    word = ''.join(values[:-300])
    coefs = np.asarray(values[-300:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 2195892 word vectors.
CPU times: user 2min 35s, sys: 4.46 s, total: 2min 40s
Wall time: 2min 40s


In [8]:
EMBEDDING_DIM = 300

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [9]:
from keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

# Setup model




In [10]:
%%time

from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

# train a 1D convnet with global maxpooling
inp = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
x = embedding_layer(inp)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
out = Dense(1, activation='sigmoid')(x)

model = Model(inp, out)

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 300)         58857900  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 996, 128)          192128    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 35, 128)           82048     
__________

In [11]:
%%time 

from keras.layers import Dense, Dropout, Input, GlobalMaxPool1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, Bidirectional
from keras.layers import Activation, BatchNormalization, CuDNNGRU
from keras.models import Model


inp = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
x = embedding_layer(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = BatchNormalization()(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, use_bias=False)(x)
x = BatchNormalization()(x)
x = Activation("relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, use_bias=False)(x)
x = BatchNormalization()(x)
out = Activation("sigmoid")(x)


model = Model(inp, out)

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 300)         58857900  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1000, 128)         140544    
_________________________________________________________________
batch_normalization_1 (Batch (None, 1000, 128)         512       
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                2048      
_________________________________________________________________
batch_normalization_2 (Batch (None, 16)                64        
__________

# Compile the model

In [12]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Setup f1-score, precision and recall metrics

In [13]:
'''
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, precision_recall_fscore_support

class Metrics(Callback):


    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []
 

    def on_epoch_end(self, epoch, logs={}):
        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
        val_targ = self.validation_data[1]
        _val_precision, _val_recall, _val_f1, _ = precision_recall_fscore_support(val_targ, val_predict, average='binary')
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print(" — val_f1: %f — val_precision: %f — val_recall: %f" %(_val_f1, _val_precision, _val_recall))
        return

metrics = Metrics()
'''

'\nfrom keras.callbacks import Callback\nfrom sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, precision_recall_fscore_support\n\nclass Metrics(Callback):\n\n\n    def on_train_begin(self, logs={}):\n        self.val_f1s = []\n        self.val_recalls = []\n        self.val_precisions = []\n \n\n    def on_epoch_end(self, epoch, logs={}):\n        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()\n        val_targ = self.validation_data[1]\n        _val_precision, _val_recall, _val_f1, _ = precision_recall_fscore_support(val_targ, val_predict, average=\'binary\')\n        self.val_f1s.append(_val_f1)\n        self.val_recalls.append(_val_recall)\n        self.val_precisions.append(_val_precision)\n        print(" — val_f1: %f — val_precision: %f — val_recall: %f" %(_val_f1, _val_precision, _val_recall))\n        return\n\nmetrics = Metrics()\n'

# Compute Class Weights

Since there is a significant target inbalance.

In [14]:
'''
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
'''

"\nfrom sklearn.utils import class_weight\n\nclass_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)\n"

# Train the Model

Use model checkpointing to save the model that attains the best validation loss.

In [None]:
%%time 
#model.fit(X_train, y_train, validation_data=(X_val, y_val),
#          epochs=2, batch_size=128, callbacks=[metrics], class_weight=class_weights)

model.fit(X_train, y_train, validation_data=(X_val, y_val),
          epochs=2, batch_size=1024)

Train on 1044897 samples, validate on 261225 samples
Epoch 1/2
  75776/1044897 [=>............................] - ETA: 5:40 - loss: 0.6485 - acc: 0.7713

# Predict validation labels

In [None]:
%%time

pred_val = model.predict([X_val], batch_size=1024, verbose=1)


# Find best threshold

In [None]:
def bestThreshold(y_true,y_pred):
    idx = 0
    cur_f1 = 0
    max_f1 = 0
    thres = 0
    for idx in np.arange(0.1, 0.501, 0.01):
        cur_f1 = f1_score(y_true, np.array(y_pred)> idx)
        if cur_f1 > max_f1:
            max_f1 = cur_f1
            thres = idx
    print('best threshold is {:.4f} with F1 score: {:.4f}'.format(thres, max_f1))
    return thres
threshold = bestThreshold(y_val,pred_val)


# Predict test labels

In [None]:
%%time

pred_test = model.predict([X_test], batch_size=1024, verbose=1)


# Prepare submission

In [None]:
submission_df = pd.DataFrame({"qid":test_df["qid"].values})
submission_df['prediction'] = (pred_test > threshold).astype(int)
submission_df.to_csv("submission.csv", index=False)