# Import the datasets into dataframes

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the competition CSVs; the paths are relative to the notebook's working directory.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
# Hold out 20% of the labelled rows for validation. From here on `train_df`
# refers only to the remaining 80%; the fixed random_state keeps the split repeatable.
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Lift pandas' display truncation so complete rows and question texts show up in cell outputs.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
# NOTE(review): -1 is the legacy "no limit" value for max_colwidth; newer pandas
# releases deprecate it in favour of None - confirm against the installed version.
pd.options.display.max_colwidth = -1

# Analyze train and test data

In [2]:
# Display 10 random training rows labelled insincere (target == 1).
# No random_state is passed to .sample, so a different set is shown on every run.
print("Sample insincere questions")
train_df.loc[train_df['target'] == 1].sample(10)

Sample insincere questions


Unnamed: 0,qid,question_text,target
837289,a4146bdfc11262fd5613,Do you think gay men should become urologists? Are you okay with a gay man being your urologist?,1
14835,02eab2c7dfb526914243,"Can Modiji restrict his ""Achhe din "" only for the BJP supporters? We, the normal people, cannot handle this much happiness.",1
379422,4a5fd6fa26f394d2f258,Why is almost everything we do or want to do illegal in the US?,1
357581,461736c3c3c8ecf56d35,Is becoming castrated a lesson that a teenage boy never forgets until his death?,1
676290,846f3cab27480795d9b8,How is the conviction of Bill Cosby at all legal? I could have sworn I read/heard/saw that his confessed deposition was given after being promised somewhat of an immunity. Was that false? What proof had they to convict him outside that statement?,1
529864,67bf53cc3d2ae03b5d59,Why north Indian girls are not that cute and innocent as compare to North eastern girls?,1
1230344,f11a60842da1493abdfe,Why did my dick fall off twice?,1
687458,86a5f16e1b9e81a7806c,"Putting my hypothetical (and somewhat sarcastic) Evangelical hat on, could all of these catastrophic hurricanes in the South and fires in the West be God's wrath on the USA for electing Trump?",1
488611,5fb1b4ac3ccf62922fed,Why is it so easy to thrash doctors?,1
974161,bed8bd1762ceb4c3fb0d,What makes this generation is so ignorant and had a rebellious mind?,1


In [3]:
# Display 10 random training rows labelled sincere (target == 0).
# No random_state is passed to .sample, so a different set is shown on every run.
print("Sample sincere questions")
train_df.loc[train_df['target'] == 0].sample(10)

Sample sincere questions


Unnamed: 0,qid,question_text,target
766074,9614c52a3cf334f738f3,Do girls like when guys talk about their emotions with them?,0
476657,5d57e19cd42ea34c8ad6,Why do I only get super frusterated by one video game? Nothing frustrates me easily and other games I’m fine with.,0
27915,057640ab543f37f29180,Why exactly is it not a good idea to hook up with a guy who has a partner who doesn't know about his hooking up?,0
1043664,cc80600853aec9de15ad,Which historical swords do you think should get more attention?,0
486084,5f3186883b0eed31e4fd,How do I ask questions effectively on social platforms?,0
536728,69206192b16aba503e0f,Is it okay to wear junior clothes for women in 30’s?,0
403641,4f179c5b510d101d76ee,Do I have to wear panties everyday?,0
736504,903da28e95eaff3995b5,What is the most worthwhile thing to do in life?,0
1096210,d6d986e353419d8e8a33,Can love happen at 14?,0
621790,79c08485b90835f11d79,What is the best career path for someone who likes independent work with little meetings and high pay? Don’t say software engineering because I tried that and didn’t like it.,0


In [4]:
import numpy as np


# Class balance of the training split, expressed as fractions (normalize=True).
target_ratios = train_df.target.value_counts(normalize=True)

print(target_ratios)

# Bar chart of the same ratios.
target_ratios.plot(kind='bar', title='Ratios (target)')

0    0.937837
1    0.062163
Name: target, dtype: float64


<matplotlib.axes._subplots.AxesSubplot at 0x7f647f1a99b0>

In [5]:
# Mean number of whitespace-separated tokens per question (len(x.split())).
# Note: the printed label says "word length", but the value is a word count.
print('Average word length of questions in train is {0:.0f}.'.format(np.mean(train_df['question_text'].apply(lambda x: len(x.split())))))
print('Average word length of questions in test is {0:.0f}.'.format(np.mean(test_df['question_text'].apply(lambda x: len(x.split())))))

Average word length of questions in train is 13.
Average word length of questions in test is 13.


In [6]:
# Largest number of whitespace-separated tokens in any single question.
# Note: the printed label says "word length", but the value is a word count.
print('Max word length of questions in train is {0:.0f}.'.format(np.max(train_df['question_text'].apply(lambda x: len(x.split())))))
print('Max word length of questions in test is {0:.0f}.'.format(np.max(test_df['question_text'].apply(lambda x: len(x.split())))))

Max word length of questions in train is 134.
Max word length of questions in test is 87.


In [7]:
# Mean number of characters per question (len of the raw string).
print('Average character length of questions in train is {0:.0f}.'.format(np.mean(train_df['question_text'].apply(lambda x: len(x)))))
print('Average character length of questions in test is {0:.0f}.'.format(np.mean(test_df['question_text'].apply(lambda x: len(x)))))

Average character length of questions in train is 71.
Average character length of questions in test is 70.


In [8]:
# Largest number of characters in any single question.
print('Max character length of questions in train is {0:.0f}.'.format(np.max(train_df['question_text'].apply(lambda x: len(x)))))
print('Max character length of questions in test is {0:.0f}.'.format(np.max(test_df['question_text'].apply(lambda x: len(x)))))

Max character length of questions in train is 1017.
Max character length of questions in test is 588.


In [9]:
# 99.9th percentile of the per-question character count.
print('p999 character length of questions in train is {0:.0f}.'.format(np.percentile(train_df['question_text'].apply(lambda x: len(x)), 99.9)))
print('p999 character length of questions in test is {0:.0f}.'.format(np.percentile(test_df['question_text'].apply(lambda x: len(x)), 99.9)))

p999 character length of questions in train is 249.
p999 character length of questions in test is 249.


## **Preparing the text data**

First, we take the question text from each dataframe, replace missing values with a placeholder, and convert the result into a plain list.

In [10]:
# Question texts as plain Python lists; missing values are replaced by the placeholder '+++'.
X_train = train_df['question_text'].fillna('+++').tolist()
X_val = val_df['question_text'].fillna('+++').tolist()
X_test = test_df['question_text'].fillna('+++').tolist()

# Labels are kept as pandas Series taken from the 'target' column.
y_train = train_df['target']
y_val = val_df['target']

print('Found %s training questions.' % len(X_train))
print('Found %s validation questions.' % len(X_val))
print('Found %s test questions.' % len(X_test))

Found 1044897 training questions.
Found 261225 validation questions.
Found 56370 test questions.


In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# NOTE(review): pad_sequences below pads/truncates *token* sequences to this length,
# while the 249 figure printed earlier is a *character*-length percentile and the
# longest training question has 134 whitespace-separated words - a smaller value
# may be sufficient; confirm before changing.
MAX_SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 300
MAX_WORDS = 100000

# num_words caps the vocabulary used by texts_to_sequences; word_index (below)
# still lists every token seen while fitting.
tokenizer = Tokenizer(num_words=MAX_WORDS, lower=True, split=' ', 
                       char_level=False, oov_token=None, document_count=0,
                      )

# The vocabulary is fitted on the training split only and then reused for val/test.
# This cell rebinds X_train / X_val / X_test from lists of strings to padded
# integer arrays, so the previous cell must be re-run before re-running this one.
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)

X_val = tokenizer.texts_to_sequences(X_val)
X_val = pad_sequences(X_val, maxlen=MAX_SEQUENCE_LENGTH)

X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of X_train:', X_train.shape)
print('Shape of y_train:', y_train.shape)

# Mapping token -> integer index, used later to build the embedding matrices.
word_index = tokenizer.word_index

print('Found %s unique tokens.' % len(word_index))

Using TensorFlow backend.


Shape of X_train: (1044897, 250)
Shape of y_train: (1044897,)
Found 196192 unique tokens.


# Setup Embedding layer

In [12]:
from gensim.models import KeyedVectors
import numpy as np
import os

def loadEmbeddings(path, dimensions, mode='r', encoding=None, errors=None):
    """Parse a whitespace-separated text embedding file into a dict.

    Every data line is expected to hold a token followed by `dimensions`
    numeric coefficients. If the token itself contains whitespace, its pieces
    are concatenated without a separator to form the key.

    Lines that do not carry at least one token piece plus `dimensions`
    coefficients (blank lines, or a "<count> <dim>" header line as found at the
    top of word2vec-style text files) are skipped rather than being stored
    under an empty key with a wrong-sized vector.

    Args:
        path: location of the embedding file.
        dimensions: number of coefficients per vector.
        mode, encoding, errors: forwarded unchanged to `open`.

    Returns:
        dict mapping token -> float32 numpy array of length `dimensions`.
    """
    print('Loading embeddings from: %s' %path)
    embeddings = {}
    # The context manager closes the handle even when a line fails to parse.
    with open(path, buffering=((2<<16) + 8), mode=mode, encoding=encoding, errors=errors) as f:
        for line in f:
            values = line.split()
            if len(values) <= dimensions:
                # Header / blank / truncated line: no token plus full vector present.
                continue
            word = ''.join(values[:-dimensions])
            coefs = np.asarray(values[-dimensions:], dtype='float32')
            embeddings[word] = coefs
    print('Found %s word vectors.' % len(embeddings))
    return embeddings

def loadEmbeddingsGensim(path, dimensions, binary=True):
    """Load word2vec-format embeddings through gensim into a plain dict.

    Args:
        path: location of the word2vec file.
        dimensions: number of trailing coefficients of each vector to keep.
        binary: forwarded to `KeyedVectors.load_word2vec_format`.

    Returns:
        dict mapping token -> float32 numpy array.
    """
    print('Loading embeddings from: %s' %path)
    gensim_vecs = KeyedVectors.load_word2vec_format(path, binary=binary)
    # Pair each row of `vectors` with the word at the same position of the
    # index-ordered word list instead of relying on the iteration order of the
    # `vocab` dict. The list is called `index_to_key` in gensim 4 and
    # `index2word` in earlier releases.
    words = getattr(gensim_vecs, 'index_to_key', None)
    if words is None:
        words = gensim_vecs.index2word
    embeddings = {}
    for word, vector in zip(words, gensim_vecs.vectors):
        embeddings[word] = np.asarray(vector[-dimensions:], dtype='float32')
    print('Found %s word vectors.' % len(embeddings))
    return embeddings
    

In [13]:
def getEmbeddingMatrix(embedding, word_index, dimensions, max_words=None):
    """Build the weight matrix for an Embedding layer from pre-trained vectors.

    Args:
        embedding: dict mapping token -> vector of length `dimensions`.
        word_index: dict mapping token -> integer row index.
        dimensions: width of each embedding vector.
        max_words: rows with index >= max_words are left as zeros. Defaults to
            the notebook-level MAX_WORDS constant when omitted.

    Returns:
        Array of shape (len(word_index) + 1, dimensions). Row i holds the
        vector of the token whose index is i; tokens missing from `embedding`
        and tokens at or beyond the cut-off keep an all-zero row.
    """
    if max_words is None:
        max_words = MAX_WORDS
    embedding_matrix = np.zeros((len(word_index) + 1, dimensions))
    for word, i in word_index.items():
        # Test the cheap index cut-off first so out-of-range words cost no lookup.
        if i >= max_words:
            continue
        embedding_vector = embedding.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [14]:
from keras.layers import Embedding

# Registry of pre-trained Embedding layers, keyed by embedding name; filled by the cells below.
embedding_layers = {}

In [None]:
# Build a frozen (trainable=False) Embedding layer initialised from the GloVe vectors.
glove_path = os.path.join('..', 'input', 'embeddings', 'glove.840B.300d', 'glove.840B.300d.txt')
embeddings_index = loadEmbeddings(glove_path, EMBEDDING_DIM)
embedding_matrix = getEmbeddingMatrix(embeddings_index, word_index, EMBEDDING_DIM)
# The raw vector dict is no longer needed once the matrix exists; drop it to free memory.
del embeddings_index
embedding_layers['glove'] = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

Loading embeddings from: ../input/embeddings/glove.840B.300d/glove.840B.300d.txt


In [None]:
# Build a frozen (trainable=False) Embedding layer initialised from the Paragram vectors.
paragram_path = os.path.join('..', 'input', 'embeddings', 'paragram_300_sl999', 'paragram_300_sl999.txt')
# This file is read as UTF-8 with undecodable bytes dropped (errors='ignore').
embeddings_index = loadEmbeddings(paragram_path, EMBEDDING_DIM, encoding='utf8', errors='ignore')
embedding_matrix = getEmbeddingMatrix(embeddings_index, word_index, EMBEDDING_DIM)
# The raw vector dict is no longer needed once the matrix exists; drop it to free memory.
del embeddings_index
embedding_layers['paragram'] = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

In [None]:
# Build a frozen (trainable=False) Embedding layer initialised from the wiki-news vectors.
wiki_path = os.path.join('..', 'input', 'embeddings', 'wiki-news-300d-1M', 'wiki-news-300d-1M.vec')
embeddings_index = loadEmbeddings(wiki_path, EMBEDDING_DIM)
embedding_matrix = getEmbeddingMatrix(embeddings_index, word_index, EMBEDDING_DIM)
# The raw vector dict is no longer needed once the matrix exists; drop it to free memory.
del embeddings_index
embedding_layers['wiki'] = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

In [None]:
# Build a frozen (trainable=False) Embedding layer initialised from the GoogleNews vectors.
google_news_path = os.path.join('..', 'input', 'embeddings', 'GoogleNews-vectors-negative300', 'GoogleNews-vectors-negative300.bin')
# The .bin file is loaded through the gensim-based loader rather than the text parser.
embeddings_index = loadEmbeddingsGensim(google_news_path, EMBEDDING_DIM)
embedding_matrix = getEmbeddingMatrix(embeddings_index, word_index, EMBEDDING_DIM)
# The raw vector dict is no longer needed once the matrix exists; drop it to free memory.
del embeddings_index
embedding_layers['google_news'] = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

In [None]:
# All embedding matrices are built, so the token index is released. The embedding
# cells above cannot be re-run after this without re-running the tokenizer cell.
del word_index

# Setup model

In [None]:
from keras.layers import Dense, Dropout, Input, GlobalMaxPool1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, Bidirectional
from keras.layers import Activation, BatchNormalization, CuDNNGRU
from keras.layers import SpatialDropout1D, Concatenate, Flatten, Reshape
from keras.regularizers import l2
from keras.models import Model


def _embedding_branch(embedding_layer, sequence_input):
    """Encode the padded token ids through one pre-trained embedding.

    Pipeline: embedding -> bidirectional CuDNNGRU(64) returning sequences ->
    global max pooling over the time axis -> bias-free Dense(16) ->
    batch normalisation -> ReLU -> Dropout(0.1).
    """
    h = embedding_layer(sequence_input)
    h = Bidirectional(CuDNNGRU(64, return_sequences=True))(h)
    h = GlobalMaxPool1D()(h)
    h = Dense(16, use_bias=False)(h)
    h = BatchNormalization()(h)
    h = Activation("relu")(h)
    return Dropout(0.1)(h)


inp = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# One identical branch per pre-trained embedding, all reading the same input.
branches = [_embedding_branch(embedding_layers[name], inp)
            for name in ('glove', 'paragram', 'wiki', 'google_news')]

# Merge the four 16-dimensional branch outputs and classify.
x = Concatenate(axis=1)(branches)
x = Dense(32, use_bias=False)(x)
x = BatchNormalization()(x)
x = Activation("relu")(x)
x = Dense(1, use_bias=False)(x)
x = BatchNormalization()(x)
out = Activation("sigmoid")(x)

model = Model(inp, out)

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# Compile the model

In [None]:
# Binary cross-entropy loss for the single sigmoid output, Adam with its default settings.
# NOTE(review): only about 6% of the training rows are positive (see the class ratios
# printed earlier), so the accuracy metric is not very informative on its own; the
# decision threshold is chosen later from the F1 score.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Train the model

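Train for two epochs while monitoring the loss on the validation split. Note: no checkpoint callback is configured in the cell below, so the weights from the final epoch are the ones used for prediction; add a model-checkpoint callback to keep the weights with the best validation loss instead.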

In [None]:
# Two passes over the training split in batches of 512; the validation split is
# evaluated alongside training. No callbacks are passed.
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=2, batch_size=512)

# Predict validation labels

In [None]:
# Model outputs for the validation split; these feed the threshold search below.
pred_val = model.predict([X_val], batch_size=1024, verbose=1)

# Find best threshold

In [None]:
from sklearn.metrics import f1_score

def bestThreshold(y_true, y_pred, thresholds=None):
    """Pick the decision threshold that maximises the F1 score.

    Args:
        y_true: ground-truth binary labels.
        y_pred: predicted scores, one per label; an (n, 1) array such as the
            output of `model.predict` is flattened to 1-D.
        thresholds: optional iterable of candidate thresholds. Defaults to
            0.10, 0.11, ..., 0.50 (np.arange(0.1, 0.501, 0.01)).

    Returns:
        The first candidate reaching the highest F1 (a score is predicted
        positive when it is strictly greater than the threshold), or 0 when no
        candidate yields an F1 above 0.
    """
    if thresholds is None:
        thresholds = np.arange(0.1, 0.501, 0.01)
    # Convert and flatten once instead of on every loop iteration.
    scores = np.asarray(y_pred).ravel()
    max_f1 = 0
    thres = 0
    for candidate in thresholds:
        cur_f1 = f1_score(y_true, scores > candidate)
        if cur_f1 > max_f1:
            max_f1 = cur_f1
            thres = candidate
    print('best threshold is {:.4f} with F1 score: {:.4f}'.format(thres, max_f1))
    return thres
# Tune the decision threshold on the held-out validation predictions.
threshold = bestThreshold(y_val,pred_val)

# Predict test labels

In [None]:
# Model outputs for the test questions.
pred_test = model.predict([X_test], batch_size=1024, verbose=1)

# Prepare submission

In [None]:
# One row per test question id, in the same order as test_df.
submission_df = pd.DataFrame({"qid":test_df["qid"].values})
# Binarise the model outputs with the threshold selected on the validation split.
submission_df['prediction'] = (pred_test > threshold).astype(int)
# Written to the notebook's working directory, without the dataframe index.
submission_df.to_csv("submission.csv", index=False)