In [1]:
import os
import time
import numpy as np 
import pandas as pd 
from tqdm import tqdm
tqdm.pandas()
import math
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Dense, Input, LSTM, CuDNNLSTM, Embedding, CuDNNGRU, MaxPool2D, Conv2D, Concatenate, SpatialDropout1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Reshape, Flatten, Dropout, Activation
from keras.layers import Activation, Wrapper
from keras.engine.topology import Layer
from keras import backend as K

from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers


Using TensorFlow backend.


In [2]:
# Read the train and test CSV files, then report the (rows, columns) of each frame.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)
print("train shape : ", train.shape)
print("test shape : ", test.shape)

train shape :  (1306122, 3)
test shape :  (375806, 2)


In [3]:
# Show the first rows of the training frame as a quick look at its columns.
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [4]:
def load_word_embedding(filepath, embed_size=300):
    """
    Given a filepath to embeddings file, return a word to vec dictionary,
    in other words, word_embedding
    Ex. {'word': array([1.97, -0.63, ..., 0.573, 2.54])}

    Parameters
    ----------
    filepath : str
        Text file with one entry per line: a token followed by its
        space-separated vector components.
    embed_size : int, default 300
        Expected vector length. Entries of any other length (e.g. a header
        line or a malformed row) are dropped.

    Returns
    -------
    dict
        Maps each token to a float32 numpy array of length ``embed_size``.
    """
    def _get_vec(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    def _read(**open_kwargs):
        # The context manager closes the handle even if decoding fails midway,
        # so the fallback below never leaves a dangling open file behind.
        with open(filepath, **open_kwargs) as handle:
            return dict(_get_vec(*line.rstrip('\n').split(' ')) for line in handle)

    print('load word embedding ......')

    try:
        word_embedding = _read()
    except UnicodeDecodeError:
        # Retry from the start as UTF-8, skipping undecodable bytes
        word_embedding = _read(encoding="utf8", errors='ignore')

    # sanity check: keep only vectors of the expected length.
    # Deleting in place avoids building a second full-size dictionary.
    words_to_del = [word for word, vec in word_embedding.items() if len(vec) != embed_size]
    for word in words_to_del:
        del word_embedding[word]
    return word_embedding

In [5]:
# Location of the GloVe vectors file; it is parsed into a word -> vector dictionary
# (load_word_embedding keeps only vectors of length 300).
EMBEDDING_FILE_glove = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
embeddings_index_glove = load_word_embedding(EMBEDDING_FILE_glove)

load word embedding ......


## preprocess

In [6]:
import re
import string

In [7]:
# ASCII punctuation from the standard library, one character per entry
regular_punct = [mark for mark in string.punctuation]

# Additional symbols (typographic quotes, arrows, box-drawing characters,
# a few accented letters, ...) that should also be separated from words
extra_punct = [
        ',', '.', '"', ':', ')', '(', '!', '?', '|', ';', "'", '$', '&',
        '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
        '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',
        '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”',
        '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾',
        '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼',
        '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
        'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»',
        '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
        '¹', '≤', '‡', '√', '«', '»', '´', 'º', '¾', '¡', '§', '£', '₤']

# De-duplicated union of both tables
all_punct = list({*regular_punct, *extra_punct})

def spacing_punctuation(text):
    """
    surround each punctuation mark / symbol found in the text
    with one space on either side
    """
    spaced = text
    for symbol in all_punct:
        if symbol not in spaced:
            continue
        spaced = spaced.replace(symbol, " " + symbol + " ")

    return spaced

In [8]:
# misspell list (quora vs. glove): maps a misspelled or unusual term to the
# text substituted for it. Built once instead of on every call.
_MISSPELL_DICT = {'demonitisation': 'demonetization', 'demonitization': 'demonetization',
                  'demonetisation': 'demonetization',
                  'pokémon': 'pokemon',
                  'Wjy': 'Why',
                  'Whst' : 'What',
                  'BNBR': 'Be Nice Be Respectful',
                  'Bolsonaro': 'Jair Bolsonaro',
                  'XXXTentacion': 'Tentacion',
                  'Žižek': 'Slovenian philosopher Slavoj Žižek',
                  'Adityanath': 'Indian monk Yogi Adityanath',
                  'Brexit': 'British Exit',
                  'Brexiter': 'British Exit supporter',
                  'Brexiters': 'British Exit supporters',
                  'Brexiteer': 'British Exit supporter',
                  'Brexiteers': 'British Exit supporters',
                  'Brexiting': 'British Exit',
                  'Brexitosis': 'British Exit disorder',
                  'brexit': 'British Exit',
                  'brexiters': 'British Exit supporters',
                  'cryptocurrencies': 'cryptocurrency',
                  'Cryptocurrency': 'cryptocurrency',
                  'Litecoin' : 'cryptocurrency',
                  'litecoin' : 'cryptocurrency',
                  'altcoin' : 'cryptocurrency',
                  'altcoins' : 'cryptocurrency',
                  'jallikattu': 'Jallikattu',
                  'Swachh': 'Swachh Bharat mission campaign ',
                  'SJWs': 'social justice warrior',
                  'Quorans': 'Quoran',
                  'Qoura': 'Quora',
                  'quoras': 'Quora',
                  'Quroa': 'Quora',
                  'QUORA': 'Quora',
                  'narcissit': 'narcissist',
                  'ethereum': 'Ethereum',
                  'Blockchain': 'blockchain',
                  'UCEED': 'Undergraduate Common Entrance Examination for Design',
                  'GDPR': 'General Data Protection Regulation',
                  'Redmi' : 'Xiaomi smartphone',
                  'OnePlus': 'Android smartphone',
                  'Machedo' : 'hot guy',
                  'Coinbase':'bitcoin broker',
                  'coinbase':'bitcoin broker',
                  'DCEU' : 'American media franchise',
                  'IIEST': 'Indian Institutes of Engineering Science and Technology',
                  'Upwork' : 'global freelancing platform',
                  'upwork' : 'global freelancing platform',
                  'HackerRank' : 'technology company focuses on competitive programming challenges'}

# Alternatives are tried left to right and the first one that matches wins, so
# longer keys must come before their own prefixes; otherwise 'Brexit' would
# shadow 'Brexiters' and 'altcoin' would shadow 'altcoins'.
# Keys are escaped so they are always matched literally.
_MISSPELL_RE = re.compile(
    '(%s)' % '|'.join(re.escape(key)
                      for key in sorted(_MISSPELL_DICT, key=len, reverse=True)))


def clean_misspell(text):
    """
    misspell list (quora vs. glove)

    Replace every occurrence of a key of the misspell table found in `text`
    (case-sensitive, matched anywhere, including inside longer words) with
    its mapped replacement, and return the resulting string.

    reference: https://www.kaggle.com/hengzheng/attention-capsule-why-not-both-lb-0-694 # noqa
    """
    # Every match is one of the table's keys by construction, so a direct
    # lookup is safe.
    return _MISSPELL_RE.sub(lambda match: _MISSPELL_DICT[match.group(0)], text)

In [9]:
def preprocess(text):
    """
    text cleaning pipeline:
    punctuation spacing first, misspelling replacement second
    """
    return clean_misspell(spacing_punctuation(text))

In [10]:
## some config values (read as globals by the tokenizer, embedding and model cells)
embed_size = 300 # how big is each word vector
max_features = 150000 # vocabulary cap: how many unique words to use (i.e. num rows in the embedding matrix); original note: 150k to 180k
maxlen = 75 # max number of words in a question to use; also the length sequences are padded to

## tokenize text

In [11]:
## preprocess
# Missing questions are replaced BEFORE cleaning: preprocess() runs string
# membership tests and raises TypeError on NaN, so a fillna placed after it
# could never take effect.
# NOTE: this overwrites the column in place, so the cell is not idempotent;
# re-running it without reloading the CSVs cleans already-cleaned text again.
train['question_text'] = train['question_text'].fillna("_##_").progress_apply(preprocess)
test['question_text'] = test['question_text'].fillna("_##_").progress_apply(preprocess)

## split to train and val
train_df, val_df = train_test_split(train, test_size=0.05, random_state=42)

## raw text arrays (missing values were already filled above)
train_X = train_df["question_text"].values
val_X = val_df["question_text"].values
test_X = test["question_text"].values

## Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features) # glove

# The vocabulary is fitted on the training split only
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

## Pad the sentences
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

## Get the target values
train_y = train_df['target'].values
val_y = val_df['target'].values

#shuffling the data (the same permutation is applied to inputs and targets)
np.random.seed(42)
trn_idx = np.random.permutation(len(train_X))
val_idx = np.random.permutation(len(val_X))

train_X = train_X[trn_idx]
val_X = val_X[val_idx]
train_y = train_y[trn_idx]
val_y = val_y[val_idx]

## word index dictionary from input text/document
word_index = tokenizer.word_index

100%|██████████| 1306122/1306122 [00:25<00:00, 50824.33it/s]
100%|██████████| 375806/375806 [00:07<00:00, 51636.13it/s]


## creating embedding matrix

In [12]:
def create_embedding_weights(word_index, embeddings_index, max_features):
    '''
    input: word_index from training set and embedding from embedding file
    output: matrix with matched embedding arrays

    word_index: dict mapping word -> integer row index
    embeddings_index: dict mapping word -> 1-D vector (all of equal length)
    max_features: upper bound on the number of rows in the result

    Returns an array with one row per index in [0, nb_words), where nb_words is
    the largest index in word_index plus one, capped at max_features. Rows of
    words present in embeddings_index hold their vector; all other rows keep a
    random draw from a normal distribution with the mean / std of all embedding
    values.
    '''
    # np.stack needs a real sequence; a dict view is not one
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]
    # Size the matrix by the highest index actually used, not by the number of
    # words: with indices starting at 1, len(word_index) rows would leave the
    # last word without a row and raise IndexError in the loop below.
    nb_words = min(max_features, max(word_index.values(), default=-1) + 1)

    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

    # matching the word and updating the corresponding embedding vector
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [13]:
# Build the embedding matrix for the tokenizer's word_index from the GloVe vectors.
embedding_matrix = create_embedding_weights(word_index, embeddings_index_glove, max_features)

  import sys


## train different models

In [14]:
def lstm_model(embedding_matrix):
    """
    Build and compile the first recurrent classifier.

    Architecture: embedding (initialised from `embedding_matrix`) -> spatial
    dropout 0.2 -> bidirectional CuDNNLSTM(128) -> bidirectional CuDNNGRU(128).
    The global max-pool of each recurrent output is concatenated and fed to a
    single sigmoid unit.

    embedding_matrix: 2-D array of initial embedding weights, one row per token index.
    returns: Model compiled with binary cross-entropy loss, the adam optimizer
             and the accuracy metric. The input length is the global `maxlen`.
    """
    # Size the embedding layer from the weights it actually receives. Relying on
    # the max_features / embed_size globals fails with a weight-shape mismatch
    # whenever the matrix has a different number of rows or columns.
    vocab_size, vector_size = np.shape(embedding_matrix)

    input_layer = Input(shape=(maxlen,))
    # create embedding layer
    x = Embedding(vocab_size, vector_size, weights=[embedding_matrix])(input_layer)
    x = SpatialDropout1D(0.2)(x)
    # bidirectional lstm
    x1 = Bidirectional(CuDNNLSTM(128, return_sequences=True), name='bidirectional_lstm')(x)
    # bidirectional gru, stacked on the lstm output
    x2 = Bidirectional(CuDNNGRU(128, return_sequences=True), name='bidirectional_gru')(x1)

    # global max pooling of each recurrent output
    max_pool1 = GlobalMaxPooling1D()(x1)
    max_pool2 = GlobalMaxPooling1D()(x2)
    x = Concatenate()([max_pool1, max_pool2])
    # output layer
    output_layer = Dense(1, activation="sigmoid", name = 'output')(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [15]:
def lstm_model_2(embedding_matrix):
    """
    Build and compile the second recurrent classifier.

    Same layout as lstm_model but with a wider first layer: embedding
    (initialised from `embedding_matrix`) -> spatial dropout 0.2 ->
    bidirectional CuDNNLSTM(256) -> bidirectional CuDNNGRU(128). The global
    max-pool of each recurrent output is concatenated and fed to a single
    sigmoid unit.

    embedding_matrix: 2-D array of initial embedding weights, one row per token index.
    returns: Model compiled with binary cross-entropy loss, the adam optimizer
             and the accuracy metric. The input length is the global `maxlen`.
    """
    # Size the embedding layer from the weights it actually receives. Relying on
    # the max_features / embed_size globals fails with a weight-shape mismatch
    # whenever the matrix has a different number of rows or columns.
    vocab_size, vector_size = np.shape(embedding_matrix)

    input_layer = Input(shape=(maxlen,))
    # create embedding layer
    x = Embedding(vocab_size, vector_size, weights=[embedding_matrix])(input_layer)
    x = SpatialDropout1D(0.2)(x)
    # bidirectional lstm
    x1 = Bidirectional(CuDNNLSTM(256, return_sequences=True), name='bidirectional_lstm')(x)
    # bidirectional gru, stacked on the lstm output
    x2 = Bidirectional(CuDNNGRU(128, return_sequences=True), name='bidirectional_gru')(x1)

    # global max pooling of each recurrent output
    max_pool1 = GlobalMaxPooling1D()(x1)
    max_pool2 = GlobalMaxPooling1D()(x2)
    x = Concatenate()([max_pool1, max_pool2])
    # output layer
    output_layer = Dense(1, activation="sigmoid", name = 'output')(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [16]:
def lstm_model_3(embedding_matrix):
    """
    Build and compile the third classifier (recurrent stack plus attention).

    Architecture: embedding (initialised from `embedding_matrix`) -> spatial
    dropout 0.2 -> bidirectional CuDNNLSTM(256) -> bidirectional CuDNNGRU(128).
    The global max-pool of each recurrent output is concatenated and reduced by
    a 16-unit relu layer. That vector is concatenated with an Attention summary
    and fed to a single sigmoid unit.

    embedding_matrix: 2-D array of initial embedding weights, one row per token index.
    returns: Model compiled with binary cross-entropy loss, the adam optimizer
             and the accuracy metric. The input length is the global `maxlen`.
    """
    # Size the embedding layer from the weights it actually receives. Relying on
    # the max_features / embed_size globals fails with a weight-shape mismatch
    # whenever the matrix has a different number of rows or columns.
    vocab_size, vector_size = np.shape(embedding_matrix)

    input_layer = Input(shape=(maxlen,))
    # create embedding layer
    x = Embedding(vocab_size, vector_size, weights=[embedding_matrix])(input_layer)
    x = SpatialDropout1D(0.2)(x)
    # bidirectional lstm
    x1 = Bidirectional(CuDNNLSTM(256, return_sequences=True), name='bidirectional_lstm')(x)
    # bidirectional gru, stacked on the lstm output
    x2 = Bidirectional(CuDNNGRU(128, return_sequences=True), name='bidirectional_gru')(x1)
    # attention
    # NOTE(review): attention is applied to the dropped-out embeddings `x` here,
    # whereas lstm_model_4 applies it to the GRU output `x2` - confirm this
    # difference is intentional.
    atten = Attention(step_dim=maxlen)(x)

    # global max pooling of each recurrent output
    max_pool1 = GlobalMaxPooling1D()(x1)
    max_pool2 = GlobalMaxPooling1D()(x2)
    conc = Concatenate()([max_pool1, max_pool2])
    conc = Dense(16, activation="relu", name = 'dense_1')(conc)
    out = Concatenate(axis=-1)([atten, conc])

    # output layer
    output_layer = Dense(1, activation="sigmoid", name = 'output')(out)
    model = Model(inputs=input_layer, outputs=output_layer)
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [17]:
def lstm_model_4(embedding_matrix):
    """
    Build and compile the fourth classifier (attention over the GRU output).

    Architecture: embedding (initialised from `embedding_matrix`) -> spatial
    dropout 0.2 -> bidirectional CuDNNLSTM(256) -> bidirectional CuDNNGRU(128).
    The global max-pool of each recurrent output is concatenated and reduced by
    an 8-unit relu layer. That vector is concatenated with an Attention summary
    of the GRU output and fed to a single sigmoid unit.

    embedding_matrix: 2-D array of initial embedding weights, one row per token index.
    returns: Model compiled with binary cross-entropy loss, the adam optimizer
             and the accuracy metric. The input length is the global `maxlen`.
    """
    # Size the embedding layer from the weights it actually receives. Relying on
    # the max_features / embed_size globals fails with a weight-shape mismatch
    # whenever the matrix has a different number of rows or columns.
    vocab_size, vector_size = np.shape(embedding_matrix)

    input_layer = Input(shape=(maxlen,))
    # create embedding layer
    x = Embedding(vocab_size, vector_size, weights=[embedding_matrix])(input_layer)
    x = SpatialDropout1D(0.2)(x)
    # bidirectional lstm
    x1 = Bidirectional(CuDNNLSTM(256, return_sequences=True), name='bidirectional_lstm')(x)
    # bidirectional gru, stacked on the lstm output
    x2 = Bidirectional(CuDNNGRU(128, return_sequences=True), name='bidirectional_gru')(x1)
    # attention over the gru output sequence
    atten = Attention(step_dim=maxlen)(x2)

    # global max pooling of each recurrent output
    max_pool1 = GlobalMaxPooling1D()(x1)
    max_pool2 = GlobalMaxPooling1D()(x2)
    conc = Concatenate()([max_pool1, max_pool2])
    conc = Dense(8, activation="relu", name = 'dense_1')(conc)
    out = Concatenate(axis=-1)([atten, conc])

    # output layer
    output_layer = Dense(1, activation="sigmoid", name = 'output')(out)
    model = Model(inputs=input_layer, outputs=output_layer)
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [18]:
# https://www.kaggle.com/suicaokhoailang/lstm-attention-baseline-0-652-lb

class Attention(Layer):
    """
    Attention pooling over the step axis of a 3-D input.

    For every step a scalar score tanh(x . W + b) is computed and
    exponentiated, masked steps are zeroed, and the scores are normalised over
    the step axis. The layer returns the score-weighted sum of the input steps,
    i.e. one vector of size `features` per sample.

    step_dim: number of steps in the input sequences (used to reshape the scores)
    W_regularizer / b_regularizer: optional regularizers for the weight vector / bias
    W_constraint / b_constraint: optional constraints for the weight vector / bias
    bias: whether a per-step bias is added to the scores
    """
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # Initialise the base layer first: it sets up its own attributes
        # (including the masking flag), so anything assigned before this call
        # risks being overwritten.
        super(Attention, self).__init__(**kwargs)
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0

    def build(self, input_shape):
        # expects (batch, steps, features)
        assert len(input_shape) == 3

        # shape is passed by keyword so the call does not depend on the
        # positional order of add_weight's parameters
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            # one bias value per step
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # the step axis is summed away, so no mask is passed downstream
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        # score of every step: flatten to (batch * steps, features), project
        # onto W, then fold back to (batch, steps)
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # zero out masked steps before normalising
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # epsilon guards against division by zero when every step is masked
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

    def get_config(self):
        # Expose the constructor arguments so a model containing this layer
        # can be serialised and rebuilt.
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [19]:
#https://www.kaggle.com/strideradu/word2vec-and-gensim-go-go-go

def train_pred(model, epochs=2):
    """
    Train `model` and return its predictions.

    Runs `epochs` single-epoch fit calls on the notebook globals
    train_X / train_y (batch size 512), validating on val_X / val_y, then
    predicts once on val_X and test_X (batch size 1024).

    model: compiled model exposing fit() and predict()
    epochs: number of single-epoch training rounds
    returns: (pred_val_y, pred_test_y)
    """
    for _ in range(epochs):
        model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y))
    # Predict after training is done: only the final validation predictions are
    # returned, so computing them every epoch was wasted work, and it left
    # pred_val_y unbound when epochs == 0.
    pred_val_y = model.predict([val_X], batch_size=1024, verbose=0)
    pred_test_y = model.predict([test_X], batch_size=1024, verbose=0)
    return pred_val_y, pred_test_y

In [20]:
# Build the first model variant from the GloVe embedding matrix.
model = lstm_model(embedding_matrix)
# Author's recorded result (presumably from a previous run):
#F1 score at threshold 0.36 is 0.685829670982302

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [21]:
# Train the first model for 2 single-epoch rounds and collect its validation / test predictions.
pred_val_y, pred_test_y = train_pred(model, epochs=2) # GloVe only

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 1240815 samples, validate on 65307 samples
Epoch 1/1
Train on 1240815 samples, validate on 65307 samples
Epoch 1/1


In [22]:
# Score the first model's validation predictions at every cut-off from 0.10 to
# 0.50 (step 0.01) and keep the cut-off with the highest F1.
thresholds = []
for raw_cut in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(raw_cut, 2)
    val_labels = (pred_val_y > thresh).astype(int)
    res = metrics.f1_score(val_y, val_labels)
    thresholds.append([thresh, res])
    print("F1 score at threshold {0} is {1}".format(thresh, res))

# best score first; ties keep their original (ascending threshold) order
thresholds = sorted(thresholds, key=lambda pair: pair[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

F1 score at threshold 0.1 is 0.5966323196783111
F1 score at threshold 0.11 is 0.6056145155768572
F1 score at threshold 0.12 is 0.6125152518738017
F1 score at threshold 0.13 is 0.6177017808097812
F1 score at threshold 0.14 is 0.624165915238954
F1 score at threshold 0.15 is 0.6298332417078981
F1 score at threshold 0.16 is 0.6349353909082458
F1 score at threshold 0.17 is 0.6383460775984141
F1 score at threshold 0.18 is 0.6433767228177641
F1 score at threshold 0.19 is 0.6470303265187483
F1 score at threshold 0.2 is 0.6520585634273361
F1 score at threshold 0.21 is 0.6561225504824431
F1 score at threshold 0.22 is 0.6586272736408401
F1 score at threshold 0.23 is 0.6608536956301329
F1 score at threshold 0.24 is 0.6635963565653464
F1 score at threshold 0.25 is 0.6647380927781796
F1 score at threshold 0.26 is 0.6676415291414247
F1 score at threshold 0.27 is 0.6695514511873351
F1 score at threshold 0.28 is 0.6714255273812061
F1 score at threshold 0.29 is 0.6743311485978296
F1 score at threshold 0

## model 2

In [23]:
# Build the second model variant from the GloVe embedding matrix.
model_2 = lstm_model_2(embedding_matrix)

In [24]:
#del model
# Train the second model for 2 single-epoch rounds and collect its validation / test predictions.
pred_val_y_2, pred_test_y_2 = train_pred(model_2, epochs=2) # GloVe only

# Author's recorded result (presumably from a previous run):
#F1 score at threshold 0.45 is 0.6864916927522274

Train on 1240815 samples, validate on 65307 samples
Epoch 1/1
Train on 1240815 samples, validate on 65307 samples
Epoch 1/1


In [25]:
# Score the second model's validation predictions at every cut-off from 0.10 to
# 0.50 (step 0.01) and keep the cut-off with the highest F1.
thresholds = []
for raw_cut in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(raw_cut, 2)
    val_labels = (pred_val_y_2 > thresh).astype(int)
    res = metrics.f1_score(val_y, val_labels)
    thresholds.append([thresh, res])
    print("F1 score at threshold {0} is {1}".format(thresh, res))

# best score first; ties keep their original (ascending threshold) order
thresholds = sorted(thresholds, key=lambda pair: pair[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

F1 score at threshold 0.1 is 0.6266823231867039
F1 score at threshold 0.11 is 0.633940062511491
F1 score at threshold 0.12 is 0.6395435839880285
F1 score at threshold 0.13 is 0.6444190910819547
F1 score at threshold 0.14 is 0.650357764455618
F1 score at threshold 0.15 is 0.653412985995495
F1 score at threshold 0.16 is 0.6585754078790291
F1 score at threshold 0.17 is 0.6606909054285426
F1 score at threshold 0.18 is 0.6631342979417157
F1 score at threshold 0.19 is 0.6655005659018418
F1 score at threshold 0.2 is 0.668538741549662
F1 score at threshold 0.21 is 0.669049621530698
F1 score at threshold 0.22 is 0.6715514495062122
F1 score at threshold 0.23 is 0.673029490616622
F1 score at threshold 0.24 is 0.675259965337955
F1 score at threshold 0.25 is 0.6779921087242438
F1 score at threshold 0.26 is 0.6792118662829312
F1 score at threshold 0.27 is 0.6778336125069794
F1 score at threshold 0.28 is 0.678462752169503
F1 score at threshold 0.29 is 0.6781321184510252
F1 score at threshold 0.3 is 0

## model 3

In [26]:
# Build the third model variant from the GloVe embedding matrix.
model_3 = lstm_model_3(embedding_matrix)

In [27]:
# Train the third model for 3 single-epoch rounds and collect its validation / test predictions.
pred_val_y_3, pred_test_y_3 = train_pred(model_3, epochs=3) # GloVe only

Train on 1240815 samples, validate on 65307 samples
Epoch 1/1
Train on 1240815 samples, validate on 65307 samples
Epoch 1/1
Train on 1240815 samples, validate on 65307 samples
Epoch 1/1


In [28]:
# Score the third model's validation predictions at every cut-off from 0.10 to
# 0.50 (step 0.01) and keep the cut-off with the highest F1.
thresholds = []
for raw_cut in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(raw_cut, 2)
    val_labels = (pred_val_y_3 > thresh).astype(int)
    res = metrics.f1_score(val_y, val_labels)
    thresholds.append([thresh, res])
    print("F1 score at threshold {0} is {1}".format(thresh, res))

# best score first; ties keep their original (ascending threshold) order
thresholds = sorted(thresholds, key=lambda pair: pair[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

F1 score at threshold 0.1 is 0.6219468390804598
F1 score at threshold 0.11 is 0.6283307389433201
F1 score at threshold 0.12 is 0.6342506507995538
F1 score at threshold 0.13 is 0.6391830559757942
F1 score at threshold 0.14 is 0.6408045977011494
F1 score at threshold 0.15 is 0.6451800796039219
F1 score at threshold 0.16 is 0.6481317164546978
F1 score at threshold 0.17 is 0.6501844282723557
F1 score at threshold 0.18 is 0.6527483610690872
F1 score at threshold 0.19 is 0.6555238483489604
F1 score at threshold 0.2 is 0.657457994021235
F1 score at threshold 0.21 is 0.6604147129311244
F1 score at threshold 0.22 is 0.6628751974723539
F1 score at threshold 0.23 is 0.6644680851063829
F1 score at threshold 0.24 is 0.6655913978494624
F1 score at threshold 0.25 is 0.6678950791242142
F1 score at threshold 0.26 is 0.6695100612423446
F1 score at threshold 0.27 is 0.6711587319120733
F1 score at threshold 0.28 is 0.6723811644216855
F1 score at threshold 0.29 is 0.6735565041563694
F1 score at threshold 0

## model 4

In [29]:
# Build the fourth model variant from the GloVe embedding matrix.
model_4 = lstm_model_4(embedding_matrix)

In [30]:
# Train the fourth model for 2 single-epoch rounds and collect its validation / test predictions.
pred_val_y_4, pred_test_y_4 = train_pred(model_4, epochs=2) # GloVe only

Train on 1240815 samples, validate on 65307 samples
Epoch 1/1
Train on 1240815 samples, validate on 65307 samples
Epoch 1/1


In [31]:
# Score the fourth model's validation predictions at every cut-off from 0.10 to
# 0.50 (step 0.01) and keep the cut-off with the highest F1.
thresholds = []
for raw_cut in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(raw_cut, 2)
    val_labels = (pred_val_y_4 > thresh).astype(int)
    res = metrics.f1_score(val_y, val_labels)
    thresholds.append([thresh, res])
    print("F1 score at threshold {0} is {1}".format(thresh, res))

# best score first; ties keep their original (ascending threshold) order
thresholds = sorted(thresholds, key=lambda pair: pair[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

F1 score at threshold 0.1 is 0.6109706113194383
F1 score at threshold 0.11 is 0.6186357556843515
F1 score at threshold 0.12 is 0.6268412438625204
F1 score at threshold 0.13 is 0.6336450328673271
F1 score at threshold 0.14 is 0.6389150499152382
F1 score at threshold 0.15 is 0.6445125454893699
F1 score at threshold 0.16 is 0.6487012355287479
F1 score at threshold 0.17 is 0.6539601027059055
F1 score at threshold 0.18 is 0.6573230553608971
F1 score at threshold 0.19 is 0.6603045685279187
F1 score at threshold 0.2 is 0.6638525687223309
F1 score at threshold 0.21 is 0.6629845769070446
F1 score at threshold 0.22 is 0.6650470053871342
F1 score at threshold 0.23 is 0.6669521464511294
F1 score at threshold 0.24 is 0.6676786295131735
F1 score at threshold 0.25 is 0.6687863172897708
F1 score at threshold 0.26 is 0.6707344131351232
F1 score at threshold 0.27 is 0.6733938782374705
F1 score at threshold 0.28 is 0.6742973708068903
F1 score at threshold 0.29 is 0.6768208886852954
F1 score at threshold 

In [32]:
# Disabled submission step: binarise the test predictions and write submission.csv.
# NOTE(review): if re-enabled as written, `pred_test_y` holds the FIRST model's test
# predictions while `best_thresh` was last assigned from the fourth model's validation
# scores (pred_val_y_4) - pair each model's predictions with its own threshold.
# pred_test_y = (pred_test_y > best_thresh).astype(int)
# test_df = pd.read_csv("../input/test.csv", usecols=["qid"])
# out_df = pd.DataFrame({"qid":test_df["qid"].values})
# out_df['prediction'] = pred_test_y
# out_df.to_csv("submission.csv", index=False)
