# Import dataset in to dataframe

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = -1

# Analyze train and test data

In [2]:
print("Sample insincere questions")
train_df.loc[train_df['target'] == 1].sample(10)

Sample insincere questions


Unnamed: 0,qid,question_text,target
1247862,f48cf262906b4ec1e99b,Do Chinese feel ashamed about their country and want to leave China the way an increasing number of Indians these days do with respect to India?,1
136091,1aa45bef4748da59ce9c,"If Campbell claims that the Russians provided tremendous support to Hillary, how can it be explained that Rosenstein believes the Russians support Trump?",1
995520,c31636dad39546e27427,"Why do girls wear skirts and shorts in public area? Obviously to look attractive, then why they have problem when someone stares them!",1
608299,771bef59843becc6773d,How do I get over my fear of black men?,1
980709,c01d87981ce02afa8d83,Is there any way to get a Muslim cleric declare that Palestinians are fake Muslims?,1
1187630,e8bbdae1cb950f38979a,I have an intense desire to shoot church choir and alter girls because they show extremely disgusting looks to me?,1
602731,760b662bf3bb2a676444,Will Islam ever get banned from Europe? We need to make it happen!,1
319996,3eb5a0354a76d175bd31,Why are punjabis so aggressive? Why do they get offended so easily?,1
683334,85d5b10b163b8adee8df,Why do guys act so innocent at first then suddenly change and want sex?,1
824518,a1938e6145b97e7921bf,"The path to atheism, is it mind corruption or mental sanitation? Should ""new discoveries"" trouble the mind?",1


In [3]:
print("Sample sincere questions")
train_df.loc[train_df['target'] == 0].sample(10)

Sample sincere questions


Unnamed: 0,qid,question_text,target
1044391,cca59fa8481fc990ab7c,"My business has $85,000 in yearly contracts and about $165,000 in yearly sales. How would I valuate this?",0
530479,67dba3131d4a707eef39,How strong was the average Roman soldier in the Roman Empire?,0
555050,6cc0d8f92cab3f66f7ee,"How would you feel or act toward a child if you found out that he knew that his dad hid $350K from you during the divorce - while you are struggling (really hard) right now, to get by while his dad is floating in money?",0
177787,22c0105412ad0d501e1c,"How much ""Political Migration"" from one State to another happens in the US, as in a ""Liberal"" leaving a conservative state for one more representative of a persons political belief or the other way around?",0
1248780,f4b92aa8f7ef59372f2d,What are the best things for University of Calcutta students to do on weekends?,0
543151,6a6a605edb5814029877,How can you make a traditional chipotle recipe into a low calorie recipe doing minor alterations?,0
603791,763ed55f2d93ead2bf3e,What Should drivers do to protesters blocking the road?,0
922683,b4cfa7855c2b8d813335,Which companies are present used the asp.net?,0
109754,157c8e3ff2f5c6fea56a,What if I wanted to play a professional sport and have a working job (like an engineer) when I grow up? Is it possible?,0
1154620,e23ed77124fae386380a,What are the origins of monkeys?,0


In [4]:
import numpy as np


target_ratios = train_df.target.value_counts(normalize=True)

print(target_ratios)

target_ratios.plot(kind='bar', title='Ratios (target)')

0    0.937837
1    0.062163
Name: target, dtype: float64


<matplotlib.axes._subplots.AxesSubplot at 0x7f70ba9e9908>

In [5]:
print('Average word length of questions in train is {0:.0f}.'.format(np.mean(train_df['question_text'].apply(lambda x: len(x.split())))))
print('Average word length of questions in test is {0:.0f}.'.format(np.mean(test_df['question_text'].apply(lambda x: len(x.split())))))

Average word length of questions in train is 13.
Average word length of questions in test is 13.


In [6]:
print('Max word length of questions in train is {0:.0f}.'.format(np.max(train_df['question_text'].apply(lambda x: len(x.split())))))
print('Max word length of questions in test is {0:.0f}.'.format(np.max(test_df['question_text'].apply(lambda x: len(x.split())))))

Max word length of questions in train is 134.
Max word length of questions in test is 87.


In [7]:
print('Average character length of questions in train is {0:.0f}.'.format(np.mean(train_df['question_text'].apply(lambda x: len(x)))))
print('Average character length of questions in test is {0:.0f}.'.format(np.mean(test_df['question_text'].apply(lambda x: len(x)))))

Average character length of questions in train is 71.
Average character length of questions in test is 70.


In [8]:
print('Max character length of questions in train is {0:.0f}.'.format(np.max(train_df['question_text'].apply(lambda x: len(x)))))
print('Max character length of questions in test is {0:.0f}.'.format(np.max(test_df['question_text'].apply(lambda x: len(x)))))

Max character length of questions in train is 1017.
Max character length of questions in test is 588.


In [9]:
print('p999 character length of questions in train is {0:.0f}.'.format(np.percentile(train_df['question_text'].apply(lambda x: len(x)), 99.9)))
print('p999 character length of questions in test is {0:.0f}.'.format(np.percentile(test_df['question_text'].apply(lambda x: len(x)), 99.9)))

p999 character length of questions in train is 249.
p999 character length of questions in test is 249.


## **Preparing the text data**

First, we will iterate over the text questions are stored, and format them into a list.

In [10]:
X_train = train_df['question_text'].fillna('+++').tolist()
X_val = val_df['question_text'].fillna('+++').tolist()
X_test = test_df['question_text'].fillna('+++').tolist()

y_train = train_df['target']
y_val = val_df['target']

print('Found %s training questions.' % len(X_train))
print('Found %s validation questions.' % len(X_val))
print('Found %s test questions.' % len(X_test))

Found 1044897 training questions.
Found 261225 validation questions.
Found 56370 test questions.


In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

MAX_SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 300
MAX_WORDS = 100000

tokenizer = Tokenizer(num_words=MAX_WORDS, lower=True, split=' ', 
                       char_level=False, oov_token=None, document_count=0,
                      )
                                   
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)

X_val = tokenizer.texts_to_sequences(X_val)
X_val = pad_sequences(X_val, maxlen=MAX_SEQUENCE_LENGTH)

X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of X_train:', X_train.shape)
print('Shape of y_train:', y_train.shape)

word_index = tokenizer.word_index

print('Found %s unique tokens.' % len(word_index))

Using TensorFlow backend.


Shape of X_train: (1044897, 250)
Shape of y_train: (1044897,)
Found 196192 unique tokens.


# Setup Embedding layer

In [12]:
from gensim.models import KeyedVectors
import numpy as np
import os

def loadEmbeddings(path, dimensions, mode='r', encoding=None, errors=None):
    print('Loading embeddings from: %s' %path)
    embeddings = {}
    f = open(path, buffering=((2<<16) + 8), mode=mode, encoding=encoding, errors=errors)
    for line in f:
        values = line.split()
        word = ''.join(values[:-dimensions])
        coefs = np.asarray(values[-dimensions:], dtype='float32')
        embeddings[word] = coefs
    f.close()
    print('Found %s word vectors.' % len(embeddings))
    return embeddings

def loadEmbeddingsGensim(path, dimensions, binary=True):
    print('Loading embeddings from: %s' %path)
    embeddings = {}
    gensim_vecs = KeyedVectors.load_word2vec_format(path, binary=binary)
    for word, vector in zip(gensim_vecs.vocab, gensim_vecs.vectors):
        coefs = np.asarray(vector[-dimensions:], dtype='float32')
        embeddings[word] = coefs
    print('Found %s word vectors.' % len(embeddings))
    return embeddings
    

In [13]:
def getEmbeddingMatrix(embedding, word_index, dimensions):
    embedding_matrix = np.zeros((len(word_index) + 1, dimensions))
    for word, i in word_index.items():
        embedding_vector = embedding.get(word)
        if embedding_vector is not None:
            if i >= MAX_WORDS:
                continue
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [14]:
from keras.layers import Embedding

embedding_layers = {}

In [15]:
glove_path = os.path.join('..', 'input', 'embeddings', 'glove.840B.300d', 'glove.840B.300d.txt')
embeddings_index = loadEmbeddings(glove_path, EMBEDDING_DIM)
embedding_matrix = getEmbeddingMatrix(embeddings_index, word_index, EMBEDDING_DIM)
del embeddings_index
embedding_layers['glove'] = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

Loading embeddings from: ../input/embeddings/glove.840B.300d/glove.840B.300d.txt
Found 2195892 word vectors.


In [16]:
paragram_path = os.path.join('..', 'input', 'embeddings', 'paragram_300_sl999', 'paragram_300_sl999.txt')
embeddings_index = loadEmbeddings(paragram_path, EMBEDDING_DIM, encoding='utf8', errors='ignore')
embedding_matrix = getEmbeddingMatrix(embeddings_index, word_index, EMBEDDING_DIM)
del embeddings_index
embedding_layers['paragram'] = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

Loading embeddings from: ../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt
Found 1703663 word vectors.


In [17]:
wiki_path = os.path.join('..', 'input', 'embeddings', 'wiki-news-300d-1M', 'wiki-news-300d-1M.vec')
embeddings_index = loadEmbeddings(wiki_path, EMBEDDING_DIM)
embedding_matrix = getEmbeddingMatrix(embeddings_index, word_index, EMBEDDING_DIM)
del embeddings_index
embedding_layers['wiki'] = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

Loading embeddings from: ../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec
Found 999995 word vectors.


In [18]:
google_news_path = os.path.join('..', 'input', 'embeddings', 'GoogleNews-vectors-negative300', 'GoogleNews-vectors-negative300.bin')
embeddings_index = loadEmbeddingsGensim(google_news_path, EMBEDDING_DIM)
embedding_matrix = getEmbeddingMatrix(embeddings_index, word_index, EMBEDDING_DIM)
del embeddings_index
embedding_layers['google_news'] = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

Loading embeddings from: ../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin
Found 3000000 word vectors.


In [19]:
del word_index

# Setup model

In [20]:
from keras.layers import Dense, Dropout, Input, GlobalMaxPool1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, Bidirectional
from keras.layers import Activation, BatchNormalization, CuDNNGRU
from keras.layers import SpatialDropout1D, Concatenate, Flatten, Reshape
from keras.regularizers import l2
from keras.models import Model


inp = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

a = embedding_layers['glove'] (inp)
a = Bidirectional(CuDNNGRU(64, return_sequences=True))(a)
a = GlobalMaxPool1D()(a)
a = Dense(16, use_bias=False)(a)
a = BatchNormalization()(a)
a = Activation("relu")(a)
a = Dropout(0.1)(a)

b = embedding_layers['paragram'] (inp)
b = Bidirectional(CuDNNGRU(64, return_sequences=True))(b)
b = GlobalMaxPool1D()(b)
b = Dense(16, use_bias=False)(b)
b = BatchNormalization()(b)
b = Activation("relu")(b)
b = Dropout(0.1)(b)

c = embedding_layers['wiki'] (inp)
c = Bidirectional(CuDNNGRU(64, return_sequences=True))(c)
c = GlobalMaxPool1D()(c)
c = Dense(16, use_bias=False)(c)
c = BatchNormalization()(c)
c = Activation("relu")(c)
c = Dropout(0.1)(c)

d = embedding_layers['google_news'] (inp)
d = Bidirectional(CuDNNGRU(64, return_sequences=True))(d)
d = GlobalMaxPool1D()(d)
d = Dense(16, use_bias=False)(d)
d = BatchNormalization()(d)
d = Activation("relu")(d)
d = Dropout(0.1)(d)

x = Concatenate(axis=1)([a, b, c, d])
x = Dense(32, use_bias=False)(x)
x = BatchNormalization()(x)
#kernel_regularizer=l2(0.01)
x = Activation("relu")(x)
x = Dense(1, use_bias=False)(x)
x = BatchNormalization()(x)
out = Activation("sigmoid")(x)

model = Model(inp, out)

print(model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 250)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 250, 300)     58857900    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 250, 300)     58857900    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 250, 300)     58857900    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_

# Compile the model

In [21]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Train the model

Use model checkpointing to save the model that attains the best validation loss.

In [22]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=4, batch_size=1024)

Train on 1044897 samples, validate on 261225 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f704a23d518>

# Predict validation labels

In [23]:
pred_val = model.predict([X_val], batch_size=1024, verbose=1)



# Find best threshold

In [24]:
from sklearn.metrics import f1_score

def bestThreshold(y_true,y_pred):
    idx = 0
    cur_f1 = 0
    max_f1 = 0
    thres = 0
    for idx in np.arange(0.1, 0.501, 0.01):
        cur_f1 = f1_score(y_true, np.array(y_pred)> idx)
        if cur_f1 > max_f1:
            max_f1 = cur_f1
            thres = idx
    print('best threshold is {:.4f} with F1 score: {:.4f}'.format(thres, max_f1))
    return thres
threshold = bestThreshold(y_val,pred_val)

best threshold is 0.3600 with F1 score: 0.6699


# Predict test labels

In [25]:
pred_test = model.predict([X_test], batch_size=1024, verbose=1)



# Prepare submission

In [26]:
submission_df = pd.DataFrame({"qid":test_df["qid"].values})
submission_df['prediction'] = (pred_test > threshold).astype(int)
submission_df.to_csv("submission.csv", index=False)