In [1]:
%load_ext memory_profiler

# Import dataset in to dataframe

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = -1

# Analyze train and test data

In [3]:
print("Sample insincere questions")
train_df.loc[train_df['target'] == 1].sample(10)

Sample insincere questions


Unnamed: 0,qid,question_text,target
321903,3f161f8b7be57098b0b6,Why can women never decide what or where they want to eat?,1
882246,acd78680501fc908040f,Why don't the Turks join the Arabs? How can they turn away from their only brothers?,1
326259,3ff43a2ee7be50e25114,I am so fucking ugly even you wouldn't date me and I know it? Yo,1
33314,06860ff88c0d7c76fd0a,What do pro-choice people mean when they talk about the woman's body?,1
73556,0e69b32e3da3314aae53,What would happen if a city or an entire country made humans illegal?,1
396719,4db993a4950d1c3476ff,How do you stop the misinformation spread by pseudo Hindu intellectuals on Quora?,1
945843,b9595cb9d4ef93bd69da,How can I send all Kashmiris to Pakistan? Can Modiji help me?,1
608883,773aa560c78f08fc3de5,Why did Obama wiretap Trump Tower?,1
566993,6f16de4d8ed13b7f07bf,Have people become paranoid and prejudice of us autists after what happened with Adam Lanza and Elliot Rodger? Have these two given us autists a bad name? Do people demonize autists because of what these two individuals did especially?,1
1062284,d028778d83046dd07143,What does it mean if my aunt comes up to my room and sits on me when I'm laying down and one time she started sucking my finger what does this mean?,1


In [4]:
print("Sample sincere questions")
train_df.loc[train_df['target'] == 0].sample(10)

Sample sincere questions


Unnamed: 0,qid,question_text,target
478067,5d9d4b740fef6116975b,Are we here to serve the state/nation or is it here to serve us?,0
84140,107934b919f1b36cd006,How can I Fix Opera Web Browser Error 72?,0
441593,568a78bdaa0eb1cfc80a,What was your best experience while having sex?,0
1265734,f80f10f698d0f44f88a2,Is there an open source SD-WAN software package that creates a private VPN network like VeloCloud or Versa?,0
855810,a7acb495015fabc9e926,What are the most unforgettable bars/pubs in fiction?,0
680343,853cd314547213fed0b7,Can I install a different desktop environment in Windows?,0
476604,5d551c56c18a1391abb5,Is Disney making a conscious decision to cast a British brunette woman in every new Star Wars movie?,0
894306,af3820b92148be03b175,What is the meaning of ensign in the navy?,0
87524,112779b5a0228f3fd383,What is the meaning of right ventricle dilation? What causes it and what is the treatment?,0
144263,1c3d7fe77201fdd1a8b1,How does one revert themselves back into being a baby?,0


In [5]:
import numpy as np


target_ratios = train_df.target.value_counts(normalize=True)

print(target_ratios)

target_ratios.plot(kind='bar', title='Ratios (target)')

0    0.937837
1    0.062163
Name: target, dtype: float64


<matplotlib.axes._subplots.AxesSubplot at 0x7ff6f224c5f8>

In [6]:
print('Average word length of questions in train is {0:.0f}.'.format(np.mean(train_df['question_text'].apply(lambda x: len(x.split())))))
print('Average word length of questions in test is {0:.0f}.'.format(np.mean(test_df['question_text'].apply(lambda x: len(x.split())))))

Average word length of questions in train is 13.
Average word length of questions in test is 13.


In [7]:
print('Max word length of questions in train is {0:.0f}.'.format(np.max(train_df['question_text'].apply(lambda x: len(x.split())))))
print('Max word length of questions in test is {0:.0f}.'.format(np.max(test_df['question_text'].apply(lambda x: len(x.split())))))

Max word length of questions in train is 134.
Max word length of questions in test is 87.


In [8]:
print('Average character length of questions in train is {0:.0f}.'.format(np.mean(train_df['question_text'].apply(lambda x: len(x)))))
print('Average character length of questions in test is {0:.0f}.'.format(np.mean(test_df['question_text'].apply(lambda x: len(x)))))

Average character length of questions in train is 71.
Average character length of questions in test is 70.


In [9]:
print('Max character length of questions in train is {0:.0f}.'.format(np.max(train_df['question_text'].apply(lambda x: len(x)))))
print('Max character length of questions in test is {0:.0f}.'.format(np.max(test_df['question_text'].apply(lambda x: len(x)))))

Max character length of questions in train is 1017.
Max character length of questions in test is 588.


In [10]:
print('p999 character length of questions in train is {0:.0f}.'.format(np.percentile(train_df['question_text'].apply(lambda x: len(x)), 99.9)))
print('p999 character length of questions in test is {0:.0f}.'.format(np.percentile(test_df['question_text'].apply(lambda x: len(x)), 99.9)))

p999 character length of questions in train is 249.
p999 character length of questions in test is 249.


## **Preparing the text data**

First, we will iterate over the text questions are stored, and format them into a list.

In [11]:
X_train = train_df['question_text'].fillna('+++').tolist()
X_val = val_df['question_text'].fillna('+++').tolist()
X_test = test_df['question_text'].fillna('+++').tolist()

y_train = train_df['target']
y_val = val_df['target']

print('Found %s training questions.' % len(X_train))
print('Found %s validation questions.' % len(X_val))
print('Found %s test questions.' % len(X_test))

Found 1044897 training questions.
Found 261225 validation questions.
Found 56370 test questions.


In [12]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

MAX_SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 300
MAX_WORDS = 100000

tokenizer = Tokenizer(num_words=MAX_WORDS, lower=True, split=' ', 
                       char_level=False, oov_token=None, document_count=0,
                      )
                                   
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)

X_val = tokenizer.texts_to_sequences(X_val)
X_val = pad_sequences(X_val, maxlen=MAX_SEQUENCE_LENGTH)

X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of X_train:', X_train.shape)
print('Shape of y_train:', y_train.shape)

word_index = tokenizer.word_index

print('Found %s unique tokens.' % len(word_index))

Using TensorFlow backend.


Shape of X_train: (1044897, 250)
Shape of y_train: (1044897,)
Found 196192 unique tokens.


In [13]:
%%time

import pickle

test_df.to_pickle('test_df.pkl')

np.save('X_train.npy', X_train)
np.save('X_val.npy', X_val)
np.save('X_test.npy', X_test)

y_train.to_pickle('y_train.pkl')
y_val.to_pickle('y_val.pkl')

pickle.dump(word_index, open('word_index.pkl', 'wb'))

CPU times: user 88 ms, sys: 1.18 s, total: 1.26 s
Wall time: 6.95 s


# Oversample training data

In [None]:
'''
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42, ratio = 1.0)
X_train, y_train = sm.fit_sample(X_train, y_train)
'''


In [None]:
'''
balanced_train_df = pd.DataFrame()
balanced_train_df[0] = np.array(y_train)

target_ratios = balanced_train_df[0].value_counts(normalize=True)

print(target_ratios)

target_ratios.plot(kind='bar', title='Ratios after SMOTE (target)')
'''

# Setup Embedding layer

In [15]:
from gensim.models import KeyedVectors
import numpy as np
import os

def loadEmbeddings(path, dimensions, mode='r', encoding=None, errors=None):
    print('Loading embeddings from: %s' %path)
    embeddings = {}
    f = open(path, buffering=((2<<16) + 8), mode=mode, encoding=encoding, errors=errors)
    for line in f:
        values = line.split()
        word = ''.join(values[:-dimensions])
        coefs = np.asarray(values[-dimensions:], dtype='float32')
        embeddings[word] = coefs
    f.close()
    print('Found %s word vectors.' % len(embeddings))
    return embeddings

def loadEmbeddingsGensim(path, dimensions, binary=True):
    print('Loading embeddings from: %s' %path)
    embeddings = {}
    gensim_vecs = KeyedVectors.load_word2vec_format(path, binary=binary)
    for word, vector in zip(gensim_vecs.vocab, gensim_vecs.vectors):
        coefs = np.asarray(vector[-dimensions:], dtype='float32')
        embeddings[word] = coefs
    print('Found %s word vectors.' % len(embeddings))
    return embeddings
    

In [16]:
def getEmbeddingMatrix(embedding, word_index, dimensions):
    embedding_matrix = np.zeros((len(word_index) + 1, dimensions))
    for word, i in word_index.items():
        embedding_vector = embedding.get(word)
        if embedding_vector is not None:
            if i >= MAX_WORDS:
                continue
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [29]:
from keras.layers import Embedding

embedding_layers = {}

In [18]:
glove_path = os.path.join('..', 'input', 'embeddings', 'glove.840B.300d', 'glove.840B.300d.txt')
embeddings_index = loadEmbeddings(glove_path, EMBEDDING_DIM)
embedding_matrix = getEmbeddingMatrix(embeddings_index, word_index, EMBEDDING_DIM)
del embeddings_index
np.save('glove.npy', embedding_matrix)
embedding_layers['glove'] = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

Loading embeddings from: ../input/embeddings/glove.840B.300d/glove.840B.300d.txt
Found 2195892 word vectors.


In [19]:
paragram_path = os.path.join('..', 'input', 'embeddings', 'paragram_300_sl999', 'paragram_300_sl999.txt')
embeddings_index = loadEmbeddings(paragram_path, EMBEDDING_DIM, encoding='utf8', errors='ignore')
embedding_matrix = getEmbeddingMatrix(embeddings_index, word_index, EMBEDDING_DIM)
del embeddings_index
np.save('paragram.npy', embedding_matrix)
embedding_layers['paragram'] = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

Loading embeddings from: ../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt
Found 1703663 word vectors.


In [20]:
wiki_path = os.path.join('..', 'input', 'embeddings', 'wiki-news-300d-1M', 'wiki-news-300d-1M.vec')
embeddings_index = loadEmbeddings(wiki_path, EMBEDDING_DIM)
embedding_matrix = getEmbeddingMatrix(embeddings_index, word_index, EMBEDDING_DIM)
del embeddings_index
np.save('wiki.npy', embedding_matrix)
embedding_layers['wiki'] = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

Loading embeddings from: ../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec
Found 999995 word vectors.


In [21]:
google_news_path = os.path.join('..', 'input', 'embeddings', 'GoogleNews-vectors-negative300', 'GoogleNews-vectors-negative300.bin')
embeddings_index = loadEmbeddingsGensim(google_news_path, EMBEDDING_DIM)
embedding_matrix = getEmbeddingMatrix(embeddings_index, word_index, EMBEDDING_DIM)
del embeddings_index
np.save('google_news.npy', embedding_matrix)
embedding_layers['google_news'] = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

Loading embeddings from: ../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin
Found 3000000 word vectors.


In [26]:
%%time

import numpy as np
import pandas as pd
import pickle

test_df = pd.read_pickle('test_df.pkl')

X_train = np.load('X_train.npy')
X_val = np.load('X_val.npy')
X_test = np.load('X_test.npy')

y_train = pd.read_pickle('y_train.pkl')
y_val = pd.read_pickle('y_val.pkl')

word_index = pickle.load(open('word_index.pkl', 'rb'))

CPU times: user 76 ms, sys: 1.11 s, total: 1.19 s
Wall time: 1.18 s


In [30]:
%%time

embedding_matrix = np.load('glove.npy')
embedding_layers['glove'] = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

embedding_matrix = np.load('paragram.npy')
embedding_layers['paragram'] = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

embedding_matrix = np.load('wiki.npy')
embedding_layers['wiki'] = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

embedding_matrix = np.load('google_news.npy')
embedding_layers['google_news'] = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)


CPU times: user 44 ms, sys: 1.54 s, total: 1.58 s
Wall time: 7.03 s


In [31]:
del word_index

# Setup model

In [32]:
%%time 

from keras.layers import Dense, Dropout, Input, GlobalMaxPool1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, Bidirectional
from keras.layers import Activation, BatchNormalization, CuDNNGRU
from keras.layers import SpatialDropout1D, Concatenate, Flatten, Reshape
from keras.regularizers import l2
from keras.models import Model


inp = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

a = embedding_layers['glove'] (inp)
a = Bidirectional(CuDNNGRU(64, return_sequences=True))(a)
a = GlobalMaxPool1D()(a)
a = Dense(16, use_bias=False)(a)
a = BatchNormalization()(a)
a = Activation("relu")(a)
a = Dropout(0.1)(a)

b = embedding_layers['paragram'] (inp)
b = Bidirectional(CuDNNGRU(64, return_sequences=True))(b)
b = GlobalMaxPool1D()(b)
b = Dense(16, use_bias=False)(b)
b = BatchNormalization()(b)
b = Activation("relu")(b)
b = Dropout(0.1)(b)

c = embedding_layers['wiki'] (inp)
c = Bidirectional(CuDNNGRU(64, return_sequences=True))(c)
c = GlobalMaxPool1D()(c)
c = Dense(16, use_bias=False)(c)
c = BatchNormalization()(c)
c = Activation("relu")(c)
c = Dropout(0.1)(c)

d = embedding_layers['google_news'] (inp)
d = Bidirectional(CuDNNGRU(64, return_sequences=True))(d)
d = GlobalMaxPool1D()(d)
d = Dense(16, use_bias=False)(d)
d = BatchNormalization()(d)
d = Activation("relu")(d)
d = Dropout(0.1)(d)

x = Concatenate(axis=1)([a, b, c, d])
x = Dense(32, use_bias=False)(x)
x = BatchNormalization()(x)
#kernel_regularizer=l2(0.01)
x = Activation("relu")(x)
x = Dense(1, use_bias=False)(x)
x = BatchNormalization()(x)
out = Activation("sigmoid")(x)

model = Model(inp, out)

print(model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 250)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 250, 300)     58857900    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 250, 300)     58857900    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 250, 300)     58857900    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_

# Compile the model

In [33]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Train the model

Use model checkpointing to save the model that attains the best validation loss.

In [34]:
%%time 

model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=2, batch_size=512)

Train on 1044897 samples, validate on 261225 samples
Epoch 1/2
Epoch 2/2
CPU times: user 13min 12s, sys: 2min 6s, total: 15min 18s
Wall time: 17min 16s


<keras.callbacks.History at 0x7fc9144c4e48>

# serialize model

In [44]:
model.save('model.h5')
print("Saved model to disk")

Saved model to disk


# serialize weights

In [36]:
model.save_weights("weights.h5")
print("Saved model weights to disk")

Saved model weights to disk


# Predict validation labels

In [37]:
%%time

pred_val = model.predict([X_val], batch_size=1024, verbose=1)

CPU times: user 27.6 s, sys: 5.82 s, total: 33.4 s
Wall time: 29.8 s


# Find best threshold

In [38]:
from sklearn.metrics import f1_score

def bestThreshold(y_true,y_pred):
    idx = 0
    cur_f1 = 0
    max_f1 = 0
    thres = 0
    for idx in np.arange(0.1, 0.501, 0.01):
        cur_f1 = f1_score(y_true, np.array(y_pred)> idx)
        if cur_f1 > max_f1:
            max_f1 = cur_f1
            thres = idx
    print('best threshold is {:.4f} with F1 score: {:.4f}'.format(thres, max_f1))
    return thres
threshold = bestThreshold(y_val,pred_val)

best threshold is 0.3100 with F1 score: 0.6713


# Predict test labels

In [39]:
%%time

pred_test = model.predict([X_test], batch_size=1024, verbose=1)

CPU times: user 5.68 s, sys: 1.49 s, total: 7.17 s
Wall time: 6.41 s


# Prepare submission

In [40]:
submission_df = pd.DataFrame({"qid":test_df["qid"].values})
submission_df['prediction'] = (pred_test > threshold).astype(int)
submission_df.to_csv("submission.csv", index=False)