# Import the dataset into dataframes

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Lift pandas' display limits so sampled questions are shown in full.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
# NOTE(review): -1 is the legacy "no limit" value for max_colwidth; newer pandas
# releases may expect None instead — confirm against the installed version.
pd.set_option("display.max_colwidth", -1)

test_df = pd.read_csv("../input/test.csv")
train_df = pd.read_csv("../input/train.csv")

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible across runs.
train_df, val_df = train_test_split(train_df, random_state=42, test_size=0.2)

In [3]:
print("Sample insincere questions")
# Rows labelled target == 1; ten of them are drawn at random for inspection.
insincere_mask = train_df['target'] == 1
train_df[insincere_mask].sample(10)

Sample insincere questions


Unnamed: 0,qid,question_text,target
788833,9a8d713542aead690737,Can you choke yourself to death?,1
1142201,dfd1a42596681c1382ae,Do people in movies use crosses and Bibles against vampires because vampires are allergic to bullshit?,1
1098431,d745f73a22aac68fe4b2,Were Mizrahi Jews allways more literate on average than their Muslim countrymen?,1
233944,2dc0b0870185b0ffb4ac,"Do you think the bottom of an abandoned, about-to-collapse coal mine is a good place for Donald Trump and Kim Jong-un’s meeting?",1
833290,a34bd92ef37c8370c5ba,Are you having flooding issues due to people boohooing and crying because President Trump is kicking so much butt?,1
1129196,dd4c8d9eea3f57f43960,How can we put uppity third world nations back in their place? We need to nip them off in the bud. Look what happened when we let Asians modernize! Do you want Arabs to modernize and take your jobs too?,1
585197,72a545ee0e3c23db90a4,Do white teens know how disrespectful they are when they walk around Thailand bare-chested or half naked like a stupid animal?,1
173207,21df4c62b34f2ebd17f5,"Was Christopher Hitchens a closet Muslim, or is Barack Hussein Obama a closet Muslim?",1
714632,8be36ad5dd53f20781af,"If Americans demand the removal of General Lee's statues under the pretence that he was racist, when he simply fought for his country, why don’t Americans also remove the Democratic party which supported slavery when the Republicans wanted to end it?",1
1012502,c66916643cef19f2598c,"Are non-Trump supporters really just beta males intimidated by his extremely masculine, alpha persona?",1


In [4]:
print("Sample sincere questions")
# Rows labelled target == 0; ten of them are drawn at random for inspection.
sincere_mask = train_df['target'] == 0
train_df[sincere_mask].sample(10)

Sample sincere questions


Unnamed: 0,qid,question_text,target
828269,a252e75d0b3c4f272ae0,How mutation works on evolutioanry strategy uncorrelated mutation with individual step size?,0
33219,0681bc82c01d96554d5a,Why does Robert C. Gallo only think that anything more than a functional cure for HIV can be create? There must be some way to trap/destroy inactive HIV cells?,0
1070113,d1b18c51f912bd92217b,What is the formula of pressure release?,0
1178322,e6e5631cf5c72b993cca,Are you required to stand (not pray) for prayer at a Christian high school?,0
512282,64522e213429ebe02e66,What kind of job gives me happiness? I don't know! How do I find out?,0
1091373,d5e21a34f0cfea113c46,Can projects substitute for lack of work experience?,0
55524,0ae6138e9a1228e3f9ec,What was the estimated population of Babylonia?,0
1236974,f269615fd2491c663dba,"How many females felt that ""beauty is a curse""?",0
251713,31438c0711b1884c94c1,What would happen if Stitches and Clown met?,0
985971,c12975281b3744cea3f8,Why is my puppy very good at responding to commands inside home but is totally oblivious to my voice and commands outside?,0


In [5]:
import numpy as np


# Relative frequency of each label in the training split (normalize=True
# yields fractions that sum to 1 rather than raw counts).
target_count = train_df['target'].value_counts(normalize=True)

print(target_count)

0    0.937837
1    0.062163
Name: target, dtype: float64


In [6]:
# `target_count` holds normalised frequencies (value_counts(normalize=True)),
# so the bars are proportions, not counts; the title is labelled accordingly.
# The trailing semicolon suppresses the Axes repr in the cell output.
target_count.plot(kind='bar', title='Proportion (target)');

## **Preparing the text data**

First, we extract the question text from each dataframe and store it as a plain Python list.

In [7]:
# Pull the raw question strings out of each split as plain Python lists.
X_train, X_val, X_test = (
    frame['question_text'].tolist() for frame in (train_df, val_df, test_df)
)

print('Found %s training questions.' % len(X_train))
print('Found %s validation questions.' % len(X_val))
print('Found %s test questions.' % len(X_test))

Found 1044897 training questions.
Found 261225 validation questions.
Found 56370 test questions.


In [8]:
%%time 

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Tokenisation / padding settings shared with the model cells.
MAX_SEQUENCE_LENGTH = 1000
EMBEDDING_DIM = 300
MAX_WORDS = 50000  # NOTE(review): defined but not passed to the Tokenizer (num_words=None)

tokenizer = Tokenizer(
    num_words=None,
    lower=True,
    split=' ',
    char_level=False,
    oov_token=None,
    document_count=0,
)

# The vocabulary is fitted on the training questions only.
tokenizer.fit_on_texts(X_train)


def _to_padded(texts):
    """Encode texts as integer sequences and pad them to MAX_SEQUENCE_LENGTH."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_SEQUENCE_LENGTH)


X_train = _to_padded(X_train)
X_val = _to_padded(X_val)
X_test = _to_padded(X_test)

# Labels exist only for the train and validation splits.
y_train, y_val = train_df['target'], val_df['target']

print('Shape of X_train:', X_train.shape)
print('Shape of y_train:', y_train.shape)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Using TensorFlow backend.


Shape of X_train: (1044897, 1000)
Shape of y_train: (1044897,)
Found 196192 unique tokens.
CPU times: user 52.9 s, sys: 3.08 s, total: 56 s
Wall time: 56.1 s


# Setup Embedding layer

In [9]:
%%time

import numpy as np
import os

# Build a {word: 300-d float32 vector} lookup from the GloVe text file.
embeddings_index = {}
glove_path = os.path.join('..', 'input', 'embeddings', 'glove.840B.300d', 'glove.840B.300d.txt')

# The context manager guarantees the handle is closed even if a malformed line
# raises while parsing. The encoding is stated explicitly so decoding does not
# depend on the platform's default locale.
# NOTE(review): assumes the GloVe file is UTF-8 encoded — confirm.
with open(glove_path, encoding='utf-8', buffering=((2<<16) + 8)) as f:
    for line in f:
        values = line.split()
        # The last 300 fields are the vector; everything before them is the
        # token, which may itself have been split on whitespace, so the pieces
        # are concatenated back together.
        word = ''.join(values[:-300])
        coefs = np.asarray(values[-300:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 2195892 word vectors.
CPU times: user 2min 20s, sys: 3.89 s, total: 2min 24s
Wall time: 2min 24s


In [10]:
EMBEDDING_DIM = 300

# One row per tokenizer index plus one extra row, since the matrix is sized
# len(word_index) + 1; every row starts as zeros.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # No pretrained vector for this token: its row stays all-zeros.
        continue
    embedding_matrix[row] = vector

In [11]:
from keras.layers import Embedding

# Embedding layer initialised from the pretrained matrix and kept frozen
# (trainable=False) so its weights are not updated during training.
embedding_layer = Embedding(
    len(word_index) + 1,
    EMBEDDING_DIM,
    trainable=False,
    input_length=MAX_SEQUENCE_LENGTH,
    weights=[embedding_matrix],
)

# Setup model




In [12]:
%%time

from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

# Baseline: a 1D convnet with global max pooling on top of the frozen
# pretrained embedding layer.
# NOTE(review): `model` is rebound by the following GRU cell before
# compile/fit, so this CNN is never trained in the notebook as written.
inp = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
x = embedding_layer(inp)
# Two conv + max-pool stages shrink the sequence axis before the final conv.
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
# Collapse the remaining sequence axis to one feature vector per question.
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
# Single sigmoid unit for the binary target.
out = Dense(1, activation='sigmoid')(x)

model = Model(inp, out)

# summary() prints the table itself and returns None; wrapping it in print()
# would emit a stray "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 300)         58857900  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 996, 128)          192128    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 35, 128)           82048     
__________

In [13]:
%%time 

from keras.layers import Dense, Dropout, Input, GlobalMaxPool1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, Bidirectional
from keras.layers import Activation, BatchNormalization, CuDNNGRU
from keras.models import Model


# Bidirectional GRU classifier on top of the frozen pretrained embedding layer.
# This rebinds `model`, so it is the network that gets compiled and trained.
inp = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
x = embedding_layer(inp)
# return_sequences=True keeps one output per timestep so the global max
# pooling below can reduce over the sequence axis.
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = BatchNormalization()(x)
x = GlobalMaxPool1D()(x)
# The Dense layers omit their bias because each is followed directly by
# BatchNormalization.
x = Dense(16, use_bias=False)(x)
x = BatchNormalization()(x)
x = Activation("relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, use_bias=False)(x)
x = BatchNormalization()(x)
# Sigmoid output for the binary target.
out = Activation("sigmoid")(x)


model = Model(inp, out)

# summary() prints the table itself and returns None; wrapping it in print()
# would emit a stray "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 300)         58857900  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1000, 128)         140544    
_________________________________________________________________
batch_normalization_1 (Batch (None, 1000, 128)         512       
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                2048      
_________________________________________________________________
batch_normalization_2 (Batch (None, 16)                64        
__________

# Compile the model

In [14]:
# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy
# reported during training.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Setup f1-score, precision and recall metrics

In [15]:
# Disabled experiment: the whole cell is a single string literal, so none of
# the code inside it executes and `metrics` is never defined. The text
# describes a Keras callback that would compute validation precision, recall
# and F1 at the end of every epoch.
'''
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, precision_recall_fscore_support

class Metrics(Callback):


    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []
 

    def on_epoch_end(self, epoch, logs={}):
        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()
        val_targ = self.validation_data[1]
        _val_precision, _val_recall, _val_f1, _ = precision_recall_fscore_support(val_targ, val_predict, average='binary')
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print(" — val_f1: %f — val_precision: %f — val_recall: %f" %(_val_f1, _val_precision, _val_recall))
        return

metrics = Metrics()
'''

'\nfrom keras.callbacks import Callback\nfrom sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, precision_recall_fscore_support\n\nclass Metrics(Callback):\n\n\n    def on_train_begin(self, logs={}):\n        self.val_f1s = []\n        self.val_recalls = []\n        self.val_precisions = []\n \n\n    def on_epoch_end(self, epoch, logs={}):\n        val_predict = (np.asarray(self.model.predict(self.validation_data[0]))).round()\n        val_targ = self.validation_data[1]\n        _val_precision, _val_recall, _val_f1, _ = precision_recall_fscore_support(val_targ, val_predict, average=\'binary\')\n        self.val_f1s.append(_val_f1)\n        self.val_recalls.append(_val_recall)\n        self.val_precisions.append(_val_precision)\n        print(" — val_f1: %f — val_precision: %f — val_recall: %f" %(_val_f1, _val_precision, _val_recall))\n        return\n\nmetrics = Metrics()\n'

# Compute Class Weights

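Because there is a significant target imbalance (roughly 6% of the training questions are insincere), class weights can be used to rebalance the loss. This step is currently disabled.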

In [16]:
# Disabled experiment: the cell is a single string literal, so nothing inside
# it executes and `class_weights` is never defined.
'''
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
'''

"\nfrom sklearn.utils import class_weight\n\nclass_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)\n"

# Train the Model

Train for two epochs while monitoring the validation loss. Note: model checkpointing (saving the model that attains the best validation loss) is not configured in the cell below.

In [None]:
%%time 
#model.fit(X_train, y_train, validation_data=(X_val, y_val),
#          epochs=2, batch_size=128, callbacks=[metrics], class_weight=class_weights)

# Two passes over the training set in batches of 1024, evaluating on the
# held-out validation split after each epoch.
model.fit(
    X_train,
    y_train,
    batch_size=1024,
    epochs=2,
    validation_data=(X_val, y_val),
)

Train on 1044897 samples, validate on 261225 samples
Epoch 1/2

# Predict validation labels

In [None]:
%%time

# Model outputs for the validation split; used afterwards to tune the
# decision threshold.
pred_val = model.predict(
    [X_val],
    verbose=1,
    batch_size=1024,
)


# Find best threshold

In [None]:
from sklearn.metrics import f1_score

def bestThreshold(y_true, y_pred, start=0.1, stop=0.501, step=0.01):
    """Search a grid of decision thresholds for the one with the highest F1.

    Each candidate threshold ``t`` in ``np.arange(start, stop, step)`` turns
    the predictions into labels via ``y_pred > t``; the candidate with the
    highest ``f1_score`` against ``y_true`` wins. On ties the lowest such
    threshold is kept, because only a strictly better score replaces the
    current best.

    Args:
        y_true: Ground-truth binary labels.
        y_pred: Predicted scores, compared element-wise with each threshold.
        start: First threshold tried (inclusive). Defaults to 0.1.
        stop: End of the threshold range (exclusive). Defaults to 0.501.
        step: Spacing between consecutive thresholds. Defaults to 0.01.

    Returns:
        The best threshold found, or 0 if no candidate achieves an F1 score
        greater than 0.
    """
    # Convert once up front instead of rebuilding the array on every iteration.
    scores = np.asarray(y_pred)
    max_f1 = 0
    thres = 0
    for candidate in np.arange(start, stop, step):
        cur_f1 = f1_score(y_true, scores > candidate)
        if cur_f1 > max_f1:
            max_f1 = cur_f1
            thres = candidate
    print('best threshold is {:.4f} with F1 score: {:.4f}'.format(thres, max_f1))
    return thres
# Tune the decision threshold on the validation labels and predictions.
threshold = bestThreshold(y_true=y_val, y_pred=pred_val)


# Predict test labels

In [None]:
%%time

# Model outputs for the unlabelled test questions.
pred_test = model.predict(
    [X_test],
    verbose=1,
    batch_size=1024,
)


# Prepare submission

In [None]:
# Binarise the test predictions with the tuned threshold (1 where the score
# exceeds it, 0 otherwise).
binary_predictions = (pred_test > threshold).astype(int)

submission_df = pd.DataFrame({"qid": test_df["qid"].values})
submission_df['prediction'] = binary_predictions
submission_df.to_csv("submission.csv", index=False)