In [1]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
tqdm.pandas()
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Dense, Input, LSTM, CuDNNLSTM, Embedding, CuDNNGRU, MaxPool2D, Conv2D, Concatenate, SpatialDropout1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Reshape, Flatten, Dropout, Activation
from keras.engine.topology import Layer

from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


In [2]:
# Show what is available under the input folder and its embeddings sub-folder.
for input_dir in ("../input/", "../input/embeddings/"):
    print(os.listdir(input_dir))

['test.csv', 'train.csv', 'sample_submission.csv', 'embeddings']
['wiki-news-300d-1M', 'glove.840B.300d', 'paragram_300_sl999', 'GoogleNews-vectors-negative300']


In [3]:
# Load the train/test CSVs into DataFrames. Later cells overwrite the
# `question_text` column of both frames in place with the preprocessed text.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
# Print (rows, columns) of each frame as a quick sanity check on the load.
print("train_original shape : ",train.shape)
print("test_original shape : ",test.shape)

train_original shape :  (1306122, 3)
test_original shape :  (375806, 2)


In [4]:
# Preview the first rows of the training frame (qid, question_text, target).
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


## preprocessing

In [5]:
import re
import string

In [6]:
# ASCII punctuation from the standard library, one character per list entry.
regular_punct = list(string.punctuation)

# Additional symbols to isolate from words. Every entry is a single character.
# Some entries repeat ('»', '¾', '§', '£' appear twice) and many overlap with
# `regular_punct`; the set() below removes all duplicates.
# NOTE(review): a few entries are letters rather than symbols
# ('Â', 'à', 'â', 'é', 'è', 'Ã', 'ï', 'Ø') -- presumably leftovers from
# mis-encoded text; confirm they should really be split off from words.
extra_punct = [
        ',', '.', '"', ':', ')', '(', '!', '?', '|', ';', "'", '$', '&',
        '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
        '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',
        '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”',
        '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾',
        '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼',
        '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
        'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»',
        '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
        '¹', '≤', '‡', '√', '«', '»', '´', 'º', '¾', '¡', '§', '£', '₤']

# De-duplicated union of both lists. sorted() (instead of list(set(...)))
# gives the same elements in a fixed order, so iteration over `all_punct`
# no longer depends on per-process string hash randomisation.
all_punct = sorted(set(regular_punct + extra_punct))


def spacing_punctuation(text):
    """
    Surround each punctuation mark / symbol listed in `all_punct` with a
    single space on both sides and return the resulting string.
    """
    for symbol in all_punct:
        # nothing to pad when the symbol does not occur in the text
        if symbol not in text:
            continue
        text = text.replace(symbol, f" {symbol} ")

    return text

In [7]:
def preprocess(text):
    """
    Main text preprocessing steps: lowercase, then put spaces around
    punctuation and symbols.

    Parameters
    ----------
    text : str
        Raw question text. Any non-string value (e.g. the float NaN pandas
        uses for a missing cell, or None) is returned unchanged so that the
        caller can still replace it with `fillna` afterwards.

    Returns
    -------
    str
        The lowercased, punctuation-spaced text (or the untouched input when
        it is not a string).
    """
    # Missing cells are not strings; calling .lower() on them would raise
    # AttributeError, so hand them back untouched.
    if not isinstance(text, str):
        return text

    # NOTE(review): lowercasing happens before the punctuation pass, so entries
    # of the punctuation list that exist only in upper case (e.g. 'Ã', 'Ø')
    # can never match here -- confirm whether that is intended.
    text = text.lower()
    text = spacing_punctuation(text)

    return text

## modeling

In [8]:
## some config values
embed_size = 300 # size of each word vector (output dimension of the Embedding layer)
max_features = 150000 # vocabulary cap: used as Tokenizer num_words and as the Embedding input dimension
maxlen = 75 # number of tokens per question: passed to pad_sequences and used as the model input length

In [9]:
## preprocess
## Missing values are replaced BEFORE preprocessing: preprocess() is written
## for str input, so the placeholder has to be in place before it runs rather
## than being filled in afterwards.
train['question_text'] = train['question_text'].fillna("_##_").progress_apply(preprocess)
test['question_text'] = test['question_text'].fillna("_##_").progress_apply(preprocess)

## split to train and val (5% hold-out, fixed seed for reproducibility)
train_df, val_df = train_test_split(train, test_size=0.05, random_state=42)

## raw text arrays; the fillna calls are kept as a harmless safety net
train_X = train_df["question_text"].fillna("_##_").values
val_X = val_df["question_text"].fillna("_##_").values
test_X = test["question_text"].fillna("_##_").values

## Tokenize the sentences (vocabulary is fitted on the training split only)
## NOTE(review): the Tokenizer is created with its default `filters`; presumably
## these drop most ASCII punctuation, in which case the punctuation spacing done
## in preprocess() mainly affects the non-ASCII symbols -- confirm against the
## Keras Tokenizer documentation.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

## Pad the sentences to a common length of `maxlen`
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)
## Get the target values
train_y = train_df['target'].values
val_y = val_df['target'].values

#shuffling the data
# The seed is set once and the two permutations are drawn in this order;
# features and targets are indexed with the same permutation so rows stay aligned.
np.random.seed(42)
trn_idx = np.random.permutation(len(train_X))
val_idx = np.random.permutation(len(val_X))

train_X = train_X[trn_idx]
val_X = val_X[val_idx]
train_y = train_y[trn_idx]
val_y = val_y[val_idx]

100%|██████████| 1306122/1306122 [00:10<00:00, 119691.92it/s]
100%|██████████| 375806/375806 [00:03<00:00, 118994.02it/s]


In [10]:
# https://www.kaggle.com/shujian/single-rnn-with-5-folds-snapshot-ensemble
def lstm_model():
    """
    Build and compile the baseline classifier.

    Architecture: token ids of length `maxlen` -> Embedding
    (`max_features` x `embed_size`, created without pretrained weights) ->
    SpatialDropout1D(0.2) -> bidirectional CuDNNLSTM (64 units per direction,
    full sequence returned) -> global max pooling -> Dense(32, relu) ->
    Dense(16, relu) -> single sigmoid output.

    Returns the Keras `Model`, compiled with binary cross-entropy loss, the
    'adam' optimizer and accuracy as the tracked metric.
    """
    token_ids = Input(shape=(maxlen,))

    # embedding lookup followed by dropout over whole embedding channels
    features = Embedding(max_features, embed_size)(token_ids)
    features = SpatialDropout1D(0.2)(features)

    # recurrent encoder, then collapse the time axis with a max
    features = Bidirectional(CuDNNLSTM(64, return_sequences=True), name='bidirectional_lstm')(features)
    features = GlobalMaxPooling1D()(features)

    # small fully connected head
    features = Dense(32, activation="relu", name = 'dense_1')(features)
    features = Dense(16, activation="relu", name = 'dense_2')(features)
    probability = Dense(1, activation="sigmoid", name = 'output')(features)

    model = Model(inputs=token_ids, outputs=probability)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model


In [11]:
# https://www.kaggle.com/strideradu/word2vec-and-gensim-go-go-go

def train_pred(model, epochs=2):
    """
    Train `model` on the notebook-level training arrays and return its
    predictions for the validation and test sets.

    Parameters
    ----------
    model : compiled Keras model
        Trained in place on the globals `train_X` / `train_y`, with
        `val_X` / `val_y` passed as validation data.
    epochs : int, default 2
        Number of single-epoch `fit` calls to run. With 0 the model is not
        trained and the predictions of the model as given are returned.

    Returns
    -------
    (pred_val_y, pred_test_y)
        Outputs of `model.predict` on `val_X` and `test_X` after the last
        epoch.
    """
    # One fit() call per epoch, so the per-epoch Keras log output is unchanged.
    for _ in range(epochs):
        model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y))

    # Predict once, after training. The previous version re-predicted the
    # validation set inside the loop and kept only the last result, and left
    # pred_val_y undefined (UnboundLocalError) when epochs was 0.
    pred_val_y = model.predict([val_X], batch_size=1024, verbose=0)
    pred_test_y = model.predict([test_X], batch_size=1024, verbose=0)

    return pred_val_y, pred_test_y

In [12]:
# Baseline run. No pretrained vectors are used here: lstm_model() creates its
# Embedding layer without any weights argument, so this is not a GloVe model.
pred_val_y, pred_test_y = train_pred(lstm_model(), epochs = 2)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 1240815 samples, validate on 65307 samples
Epoch 1/1
Train on 1240815 samples, validate on 65307 samples
Epoch 1/1


In [13]:
# One entry per experiment: [validation predictions, test predictions, label]
outputs = [[pred_val_y, pred_test_y, 'LSTM baseline']]

In [14]:
# Scan decision thresholds from 0.10 to 0.50 in steps of 0.01 and record the
# validation F1 obtained at each one as [threshold, f1].
thresholds = []
for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
    res = metrics.f1_score(val_y, (pred_val_y > thresh).astype(int))
    print("F1 score at threshold {0} is {1}".format(thresh, res))
    thresholds.append([thresh, res])

# Highest F1 first; the threshold of the top entry is the one used later.
thresholds.sort(key=lambda pair: pair[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

F1 score at threshold 0.1 is 0.5964787479992887
F1 score at threshold 0.11 is 0.6046087986155387
F1 score at threshold 0.12 is 0.6112610516519311
F1 score at threshold 0.13 is 0.6177642816473715
F1 score at threshold 0.14 is 0.6231729745426388
F1 score at threshold 0.15 is 0.6269567785763512
F1 score at threshold 0.16 is 0.6316
F1 score at threshold 0.17 is 0.6345684039087948
F1 score at threshold 0.18 is 0.6384472434441462
F1 score at threshold 0.19 is 0.6412165705296277
F1 score at threshold 0.2 is 0.6432150090608678
F1 score at threshold 0.21 is 0.6439762290653701
F1 score at threshold 0.22 is 0.6446317171938273
F1 score at threshold 0.23 is 0.6444690265486726
F1 score at threshold 0.24 is 0.6448796866256296
F1 score at threshold 0.25 is 0.6475048093244314
F1 score at threshold 0.26 is 0.6488776912505727
F1 score at threshold 0.27 is 0.6483605607693199
F1 score at threshold 0.28 is 0.6479962502929459
F1 score at threshold 0.29 is 0.6493783303730017
F1 score at threshold 0.3 is 0.647

In [15]:
# Turn the predicted probabilities into 0/1 labels with the threshold chosen
# on the validation set (note: this rebinds pred_test_y to the labels).
pred_test_y = (pred_test_y > best_thresh).astype(int)

# Only the ids are needed from the test file for the submission.
test_df = pd.read_csv("../input/test.csv", usecols=["qid"])
out_df = pd.DataFrame({"qid":test_df["qid"].values}).assign(prediction=pred_test_y)
out_df.to_csv("submission.csv", index=False)