In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are read from the directory named by `base_dir` below ("./input" in this notebook).
# For example, running this cell (by clicking run or pressing Shift+Enter) will list the files in that directory

import os
base_dir = "./input"
print(os.listdir(base_dir))

# Any results you write to the current directory are saved as output.

['embeddings', 'train.csv', 'test.csv', 'sample_submission.csv']


In [2]:
## Global configuration read by the preprocessing, embedding-loading and model cells below
embed_size = 300 # dimensionality of each word vector
max_features = 95000 # vocabulary cap: how many unique words to use (i.e. number of rows in the embedding matrix)
maxlen = 70 # sequence length passed to pad_sequences: max number of words in a question to use

**Load packages and data**

In [3]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers

import tensorflow as tf
import keras

Using TensorFlow backend.


In [4]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/paragag/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [6]:
# Don't hog GPU: ask TensorFlow to grow its GPU memory allocation on demand
# (allow_growth) and register the resulting session as the Keras backend session.
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
keras.backend.set_session(sess)

In [7]:
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]
def clean_text(x):
    """Surround every character listed in ``puncts`` with single spaces.

    The argument is converted with ``str()`` first, so non-string values are
    processed as their string form. Returns the padded string.
    """
    text = str(x)
    for symbol in puncts:
        # Skipping symbols that do not occur is equivalent to a no-op replace.
        if symbol in text:
            text = text.replace(symbol, " {} ".format(symbol))
    return text

In [8]:
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

def tag_part_of_speech(text):
    """Count nouns, adjectives and verbs in ``text``.

    The text is split on single spaces, characters in ``string.punctuation``
    are removed from each piece, empty pieces are dropped, and the remaining
    tokens are tagged with ``pos_tag``.

    Returns:
        list: ``[noun_count, adjective_count, verb_count]``.
    """
    noun_tags = ('NN', 'NNP', 'NNPS', 'NNS')
    adjective_tags = ('JJ', 'JJR', 'JJS')
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')

    tokens = []
    for piece in text.split(' '):
        stripped = ''.join(ch for ch in piece if ch not in string.punctuation)
        if stripped:
            tokens.append(stripped)

    # Only the tag (second element of each tagged pair) is needed for counting.
    tags = [tagged[1] for tagged in pos_tag(tokens)]
    return [
        sum(1 for tag in tags if tag in noun_tags),
        sum(1 for tag in tags if tag in adjective_tags),
        sum(1 for tag in tags if tag in verb_tags),
    ]

In [9]:
def _safe_ratio(numerator, denominator):
    """Element-wise ``numerator / denominator`` for pandas Series, with 0.0 where the denominator is 0."""
    return (numerator / denominator.where(denominator != 0)).fillna(0.0)


def gen_metadata(df):
    """Add hand-crafted text statistics of ``df['question_text']`` as new columns of ``df``.

    The frame is modified in place and nothing is returned. ``question_text``
    must contain strings; relies on ``tqdm.pandas()`` having registered
    ``progress_apply``.

    Every ratio feature is 0.0 when its denominator is 0 (empty or
    whitespace-only question). Dividing directly would raise
    ``ZeroDivisionError`` for the row-wise float division, or yield NaN/inf
    for the Series divisions.
    """
    questions = df['question_text']

    # Pure statistical features
    df['length'] = questions.progress_apply(lambda x : len(x))
    df['capitals'] = questions.progress_apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['caps_vs_length'] = _safe_ratio(df['capitals'], df['length'])
    df['num_exclamation_marks'] = questions.progress_apply(lambda comment: comment.count('!'))
    df['num_question_marks'] = questions.progress_apply(lambda comment: comment.count('?'))
    df['num_punctuation'] = questions.progress_apply(lambda comment: sum(comment.count(w) for w in '.,;:'))
    df['num_symbols'] = questions.progress_apply(lambda comment: sum(comment.count(w) for w in '*&$%'))
    df['num_words'] = questions.progress_apply(lambda comment: len(comment.split()))
    df['num_unique_words'] = questions.progress_apply(lambda comment: len(set(w for w in comment.split())))
    df['words_vs_unique'] = _safe_ratio(df['num_unique_words'], df['num_words'])
    df['num_smilies'] = questions.progress_apply(lambda comment: sum(comment.count(w) for w in (':-)', ':)', ';-)', ';)')))
    # NOTE(review): ':()' and ';-()' look like typos for ':(' and ';-(' -- left
    # unchanged so the feature values stay as they were; confirm intent.
    df['num_sad'] = questions.progress_apply(lambda comment: sum(comment.count(w) for w in (':-<', ':()', ';-()', ';(')))
    # Same value as 'length'; kept as its own column because callers select it by name.
    df['num_chars'] = df['length']

    # POS-based features
    df['nouns'], df['adjectives'], df['verbs'] = zip(*questions.progress_apply(lambda comment: tag_part_of_speech(comment)))
    df['nouns_vs_length'] = _safe_ratio(df['nouns'], df['length'])
    df['adjectives_vs_length'] = _safe_ratio(df['adjectives'], df['length'])
    df['verbs_vs_length'] = _safe_ratio(df['verbs'], df['length'])
    df['nouns_vs_words'] = _safe_ratio(df['nouns'], df['num_words'])
    df['adjectives_vs_words'] = _safe_ratio(df['adjectives'], df['num_words'])
    df['verbs_vs_words'] = _safe_ratio(df['verbs'], df['num_words'])

    def _mean_word_len(text):
        # np.mean([]) would emit a warning and return NaN for a question without words.
        lengths = [len(w) for w in str(text).split()]
        return np.mean(lengths) if lengths else 0.0

    # More Handy Features
    df["count_words_title"] = questions.progress_apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
    df["mean_word_len"] = questions.progress_apply(_mean_word_len)
    df['punct_percent'] = _safe_ratio(df['num_punctuation'] * 100, df['num_words'])

In [10]:
def load_and_prec():
    """Load train/test CSVs and turn them into model-ready arrays.

    Steps: compute the hand-crafted metadata columns on the raw text,
    lower-case and punctuation-pad the questions, hold out 0.1% of the
    training rows for validation, tokenize and pad the text to ``maxlen``,
    and finally shuffle the training and validation rows.

    Returns:
        tuple: ``(train_X, val_X, test_X, train_y, val_y, word_index,
        train_meta, val_meta, test_meta)`` where the ``*_X`` arrays are padded
        token-id sequences, the ``*_meta`` arrays hold one row of metadata
        features per question, and ``word_index`` is the fitted tokenizer's
        word-to-index mapping. Row ``i`` of ``train_X``, ``train_meta`` and
        ``train_y`` refer to the same question (likewise for the validation
        arrays).
    """
    train_df = pd.read_csv(base_dir + "/train.csv")
    test_df = pd.read_csv(base_dir + "/test.csv")

    # Generate metadata on the raw text (before lower-casing, so capital counts are meaningful)
    gen_metadata(test_df)
    gen_metadata(train_df)

    train_df["question_text"] = train_df["question_text"].str.lower()
    test_df["question_text"] = test_df["question_text"].str.lower()

    train_df["question_text"] = train_df["question_text"].progress_apply(lambda x: clean_text(x))
    test_df["question_text"] = test_df["question_text"].progress_apply(lambda x: clean_text(x))

    print("Train shape : ",train_df.shape)
    print("Test shape : ",test_df.shape)

    ## split to train and val
    train_df, val_df = train_test_split(train_df, test_size=0.001, random_state=42)

    # Concatenate all meta-features into one list of features
    metadata_keys = ['length', 'capitals', 'caps_vs_length',
                     'num_exclamation_marks', 'num_question_marks', 'num_punctuation',
                     'num_symbols', 'num_words', 'num_unique_words', 'words_vs_unique',
                     'num_smilies', 'num_sad', 'num_chars', 'nouns', 'adjectives', 'verbs',
                     'nouns_vs_length', 'adjectives_vs_length', 'verbs_vs_length', 'nouns_vs_words',
                     'adjectives_vs_words', 'verbs_vs_words', 'count_words_title',
                     'mean_word_len', 'punct_percent']
    test_meta  = create_metadata_feature(test_df, metadata_keys)
    val_meta   = create_metadata_feature(val_df, metadata_keys)
    train_meta = create_metadata_feature(train_df, metadata_keys)

    ## fill up the missing values
    train_X = train_df["question_text"].fillna("_##_").values
    val_X = val_df["question_text"].fillna("_##_").values
    test_X = test_df["question_text"].fillna("_##_").values

    ## Tokenize the sentences (vocabulary is fitted on the training split only)
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_X))
    train_X = tokenizer.texts_to_sequences(train_X)
    val_X = tokenizer.texts_to_sequences(val_X)
    test_X = tokenizer.texts_to_sequences(test_X)

    ## Pad the sentences
    train_X = pad_sequences(train_X, maxlen=maxlen)
    val_X = pad_sequences(val_X, maxlen=maxlen)
    test_X = pad_sequences(test_X, maxlen=maxlen)

    ## Get the target values
    train_y = train_df['target'].values
    val_y = val_df['target'].values

    #shuffling the data
    np.random.seed(2018)
    trn_idx = np.random.permutation(len(train_X))
    val_idx = np.random.permutation(len(val_X))

    train_X = train_X[trn_idx]
    val_X = val_X[val_idx]
    train_y = train_y[trn_idx]
    val_y = val_y[val_idx]
    # The metadata rows must follow the same permutation, otherwise each
    # metadata vector is paired with the label of a different question.
    train_meta = train_meta[trn_idx]
    val_meta = val_meta[val_idx]

    return train_X, val_X, test_X, train_y, val_y, tokenizer.word_index, train_meta, val_meta, test_meta

In [11]:
def create_metadata_feature(df, keys):
    """Gather the columns of ``df`` named in ``keys`` into one array.

    Each column's values become one column of the result, in the order given
    by ``keys`` (the per-column arrays are stacked and then transposed), so
    the result has one row per row of ``df``. A tqdm bar tracks the columns.
    """
    columns = [df[name].values for name in tqdm(keys)]
    return np.array(columns).T

**Load data**

In [13]:
(train_X, val_X, test_X,
 train_y, val_y,
 word_index,
 train_meta, val_meta, test_meta) = load_and_prec()

# Collect words in word_index iteration order, stopping right after the first
# word whose index reaches max_features (that word is still included).
vocab = []
for word, rank in word_index.items():
    vocab.append(word)
    if rank >= max_features:
        break

100%|██████████| 56370/56370 [00:00<00:00, 378714.40it/s]
100%|██████████| 56370/56370 [00:00<00:00, 98970.55it/s]
100%|██████████| 56370/56370 [00:01<00:00, 31696.18it/s]
100%|██████████| 56370/56370 [00:00<00:00, 281662.24it/s]
100%|██████████| 56370/56370 [00:00<00:00, 276590.22it/s]
100%|██████████| 56370/56370 [00:00<00:00, 196299.81it/s]
100%|██████████| 56370/56370 [00:00<00:00, 209536.96it/s]
100%|██████████| 56370/56370 [00:00<00:00, 247292.00it/s]
100%|██████████| 56370/56370 [00:00<00:00, 147955.52it/s]
100%|██████████| 56370/56370 [00:00<00:00, 245965.81it/s]
100%|██████████| 56370/56370 [00:00<00:00, 314425.13it/s]
100%|██████████| 56370/56370 [00:00<00:00, 370621.11it/s]
100%|██████████| 56370/56370 [00:45<00:00, 1227.81it/s]
100%|██████████| 56370/56370 [00:00<00:00, 256226.41it/s]
100%|██████████| 56370/56370 [00:01<00:00, 34051.78it/s]
100%|██████████| 1306122/1306122 [00:02<00:00, 601883.45it/s]
100%|██████████| 1306122/1306122 [00:09<00:00, 143898.21it/s]
100%|██████

Train shape :  (1306122, 28)
Test shape :  (56370, 27)


**Load embeddings**

In [15]:
def _read_embedding_file(path, min_line_len=0, **open_kwargs):
    """Parse a space-separated text embedding file into ``{word: float32 vector}``.

    Only lines longer than ``min_line_len`` characters are used (a way to skip
    header or truncated lines). ``open_kwargs`` are forwarded to ``open``; the
    file is closed as soon as it has been read.
    """
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    with open(path, **open_kwargs) as handle:
        return dict(get_coefs(*line.split(" ")) for line in handle if len(line) > min_line_len)


def _fill_embedding_matrix(word_index, embeddings_index, emb_mean, emb_std, embed_size):
    """Build the embedding matrix for ``word_index``.

    Rows are drawn from N(emb_mean, emb_std) and then overwritten with the
    pretrained vector of every word that has one. The Keras tokenizer's
    ``word_index`` starts at 1, so a vocabulary smaller than ``max_features``
    needs ``len(word_index) + 1`` rows for its largest index to fit.
    """
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix


def _vector_size(embeddings_index):
    """Dimensionality of the stored vectors, read from the first entry (no full-table copy)."""
    return next(iter(embeddings_index.values())).shape[0]


def load_glove(word_index):
    """Return the GloVe (840B, 300d) embedding matrix for ``word_index``.

    Words without a pretrained vector keep a random row drawn with the
    hard-coded mean/std below.
    """
    EMBEDDING_FILE = base_dir + '/embeddings/glove.840B.300d/glove.840B.300d.txt'
    embeddings_index = _read_embedding_file(EMBEDDING_FILE, encoding="utf8")

    emb_mean,emb_std = -0.005838499,0.48782197
    return _fill_embedding_matrix(word_index, embeddings_index, emb_mean, emb_std,
                                  _vector_size(embeddings_index))


def load_fasttext(word_index):
    """Return the fastText (wiki-news 300d 1M) embedding matrix for ``word_index``.

    Lines of 100 characters or fewer are skipped. The random initialisation
    uses the mean/std computed over all loaded vectors.
    """
    EMBEDDING_FILE = base_dir + '/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    embeddings_index = _read_embedding_file(EMBEDDING_FILE, min_line_len=100, encoding="utf8")

    # np.stack needs a real sequence; a dict view is rejected by newer NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    return _fill_embedding_matrix(word_index, embeddings_index, emb_mean, emb_std,
                                  all_embs.shape[1])


def load_para(word_index):
    """Return the Paragram (300d, SL999) embedding matrix for ``word_index``.

    Lines of 100 characters or fewer are skipped and undecodable bytes are
    ignored. Prints the hard-coded mean/std used for the random rows.
    """
    EMBEDDING_FILE = base_dir + '/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    embeddings_index = _read_embedding_file(EMBEDDING_FILE, min_line_len=100,
                                            encoding="utf8", errors='ignore')

    emb_mean,emb_std = -0.0053247833,0.49346462
    print(emb_mean,emb_std,"para")
    return _fill_embedding_matrix(word_index, embeddings_index, emb_mean, emb_std,
                                  _vector_size(embeddings_index))

In [16]:
embedding_matrix_1 = load_glove(word_index)
# embedding_matrix_2 = load_fasttext(word_index)

In [17]:
embedding_matrix_3 = load_para(word_index)

-0.0053247833 0.49346462 para


In [18]:
# Calculate F-1 score
def f1_score(y_true, y_pred):
    """Batch-wise F1 metric built from Keras backend ops.

    Predictions and targets are clipped to [0, 1] and rounded, so a prediction
    counts as positive once it rounds to 1.

    ``K.epsilon()`` is added to every denominator. The counts are backend
    tensors, so a Python ``if count == 0`` cannot branch on their runtime
    value; without the epsilon a batch with no positive targets or no positive
    predictions would produce NaN. With it, such batches score 0.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted probabilities, same shape as ``y_true``.

    Returns:
        Scalar tensor with the F1 score of the batch.
    """
    # Count positive samples.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))

    # How many selected items are relevant?
    precision = true_positives / (predicted_positives + K.epsilon())

    # How many relevant items are selected?
    recall = true_positives / (actual_positives + K.epsilon())

    # Harmonic mean of precision and recall.
    return 2 * (precision * recall) / (precision + recall + K.epsilon())

**CNN Model**

In [19]:
# https://www.kaggle.com/yekenot/2dcnn-textclassifier
def model_cnn(embedding_matrix):
    """Build and compile a 2-D convolutional text classifier.

    The embedded sequence is reshaped to ``(maxlen, embed_size, 1)`` and fed to
    one Conv2D branch per kernel height (1, 2, 3, 5 tokens, 36 filters each).
    Each branch is max-pooled over the full remaining sequence axis, and the
    branches are concatenated, flattened, passed through dropout and fed to a
    sigmoid unit. The embedding layer is initialised from ``embedding_matrix``.
    """
    kernel_heights = [1, 2, 3, 5]
    filters_per_branch = 36

    tokens_in = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens_in)
    embedded = Reshape((maxlen, embed_size, 1))(embedded)

    pooled_branches = []
    for height in kernel_heights:
        branch = Conv2D(filters_per_branch, kernel_size=(height, embed_size),
                        kernel_initializer='he_normal', activation='elu')(embedded)
        # Pool window spans every valid convolution position of this branch.
        branch = MaxPool2D(pool_size=(maxlen - height + 1, 1))(branch)
        pooled_branches.append(branch)

    merged = Concatenate(axis=1)(pooled_branches)
    merged = Flatten()(merged)
    merged = Dropout(0.1)(merged)

    prob = Dense(1, activation="sigmoid")(merged)

    cnn = Model(inputs=tokens_in, outputs=prob)
    cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1_score])

    return cnn

**Attention layer**

In [20]:
# https://www.kaggle.com/suicaokhoailang/lstm-attention-baseline-0-652-lb

class Attention(Layer):
    """Attention pooling over the time axis of a 3-D input.

    For an input ``x`` of shape ``(batch, step_dim, features)`` the layer
    learns a weight vector ``W`` (one entry per feature) and, optionally, a
    bias ``b`` (one entry per step). It scores each step with
    ``tanh(x . W + b)``, exponentiates and normalises the scores over the
    steps, and returns the score-weighted sum of the steps, shape
    ``(batch, features)``.

    Args:
        step_dim: number of time steps of the input (second axis).
        W_regularizer, b_regularizer: optional regularizers for ``W`` / ``b``.
        W_constraint, b_constraint: optional constraints for ``W`` / ``b``.
        bias: whether to add the per-step bias.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)
        # Set after the base initialiser so the flag cannot be reset by it.
        self.supports_masking = True

    def build(self, input_shape):
        assert len(input_shape) == 3

        # Name and shape are passed by keyword so the call does not depend on
        # the legacy positional ``add_weight(shape, ...)`` signature.
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[1],),
                                     initializer='zero',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # The time axis is summed away, so no mask is passed downstream.
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        # One score per (sample, step): flatten to (batch*steps, features),
        # project onto W, then fold back to (batch, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Masked steps get zero weight before normalisation.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # Normalise over steps; epsilon guards against an all-zero row.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

    def get_config(self):
        # Expose the constructor arguments so the layer can be re-created
        # from a saved model configuration.
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

**LSTM models**

In [21]:
def model_lstm_atten(embedding_matrix):
    """Build and compile a two-layer bidirectional CuDNN-LSTM classifier with attention.

    The embedding layer is initialised from ``embedding_matrix`` and frozen.
    Two bidirectional CuDNNLSTM layers (128 then 64 units) feed the
    ``Attention`` layer, followed by a 64-unit ReLU layer and a sigmoid output.
    """
    tokens_in = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size,
                         weights=[embedding_matrix], trainable=False)(tokens_in)
    sequence = Bidirectional(CuDNNLSTM(128, return_sequences=True))(embedded)
    sequence = Bidirectional(CuDNNLSTM(64, return_sequences=True))(sequence)
    context = Attention(maxlen)(sequence)
    hidden = Dense(64, activation="relu")(context)
    prob = Dense(1, activation="sigmoid")(hidden)

    lstm_model = Model(inputs=tokens_in, outputs=prob)
    lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1_score])

    return lstm_model

In [22]:
def model_gru_srk_atten(embedding_matrix):
    """Build and compile a single bidirectional GRU classifier with attention.

    The embedding layer is initialised from ``embedding_matrix`` and left
    trainable. A 64-unit bidirectional GRU feeds the ``Attention`` layer,
    followed by a 16-unit ReLU layer, dropout (0.1) and a sigmoid output.
    """
    tokens_in = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens_in)
    sequence = Bidirectional(GRU(64, return_sequences=True))(embedded)
    context = Attention(maxlen)(sequence)
    hidden = Dense(16, activation="relu")(context)
    hidden = Dropout(0.1)(hidden)
    prob = Dense(1, activation="sigmoid")(hidden)

    gru_model = Model(inputs=tokens_in, outputs=prob)
    gru_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1_score])

    return gru_model

In [23]:
def model_lstm_du(embedding_matrix):
    """Build and compile a bidirectional GRU classifier with average + max pooling.

    Despite the name, the recurrent layer is a 64-unit bidirectional GRU. Its
    output sequence is summarised by global average pooling and global max
    pooling; the two summaries are concatenated and passed through a 64-unit
    ReLU layer, dropout (0.1) and a sigmoid output. The embedding layer is
    initialised from ``embedding_matrix`` and left trainable.
    """
    tokens_in = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens_in)
    sequence = Bidirectional(GRU(64, return_sequences=True))(embedded)

    pooled_avg = GlobalAveragePooling1D()(sequence)
    pooled_max = GlobalMaxPooling1D()(sequence)
    summary = concatenate([pooled_avg, pooled_max])

    hidden = Dense(64, activation="relu")(summary)
    hidden = Dropout(0.1)(hidden)
    prob = Dense(1, activation="sigmoid")(hidden)

    pooled_model = Model(inputs=tokens_in, outputs=prob)
    pooled_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1_score])
    return pooled_model

In [24]:
def model_gru_atten_3(embedding_matrix):
    """Build and compile a three-layer bidirectional GRU classifier with attention.

    The embedding layer is initialised from ``embedding_matrix`` and frozen.
    Three stacked bidirectional GRUs (128, 64, 32 units) feed the
    ``Attention`` layer, whose output goes straight to a sigmoid unit.
    """
    tokens_in = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size,
                         weights=[embedding_matrix], trainable=False)(tokens_in)
    sequence = Bidirectional(GRU(128, return_sequences=True))(embedded)
    sequence = Bidirectional(GRU(64, return_sequences=True))(sequence)
    sequence = Bidirectional(GRU(32, return_sequences=True))(sequence)
    context = Attention(maxlen)(sequence)
    prob = Dense(1, activation="sigmoid")(context)

    gru_model = Model(inputs=tokens_in, outputs=prob)
    gru_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1_score])

    return gru_model

**Classifier for hand-crafted features**

In [46]:
def model_metafeatures(num_features):
    """Build and compile a small dense classifier over the hand-crafted features.

    Args:
        num_features: width of the metadata vector given for each question.

    The network is a 16-unit ReLU layer, dropout (0.3) and a sigmoid output,
    compiled with Adam at learning rate 1e-2.
    """
    features_in = Input(shape=(num_features,))
    hidden = Dense(16, activation='relu')(features_in)
    hidden = Dropout(0.3)(hidden)
    prob = Dense(1, activation="sigmoid")(hidden)

    meta_model = Model(inputs=features_in, outputs=prob)
    meta_model.compile(loss='binary_crossentropy', optimizer=optimizers.Adam(1e-2), metrics=[f1_score])
    return meta_model

**Train and predict**

In [50]:
def train_pred(model, epochs=2):
    """Fit ``model`` on the global training arrays, one ``fit`` call per epoch.

    Uses the module-level ``train_X``/``train_y`` for training and
    ``val_X``/``val_y`` for validation. Despite the name, no prediction is
    made and nothing is returned.
    """
    for _epoch in range(epochs):
        fit_options = {"batch_size": 4096, "epochs": 1, "validation_data": (val_X, val_y)}
        model.fit(train_X, train_y, **fit_options)

**Main part: load, train, pred and blend**

In [26]:
# Blend the pretrained embeddings by element-wise averaging. Only GloVe
# (embedding_matrix_1) and Paragram (embedding_matrix_3) are averaged here; the
# three-way variant with fastText (embedding_matrix_2) is kept commented out.
# embedding_matrix = np.mean([embedding_matrix_1, embedding_matrix_2, embedding_matrix_3], axis = 0)
embedding_matrix = np.mean([embedding_matrix_1, embedding_matrix_3], axis = 0)
np.shape(embedding_matrix)

(95000, 300)

In [27]:
outputs = []

**Train individual models**

In [47]:
# Order matters: the training cells below address these models by position.
#   0: 3-layer GRU + attention (averaged embeddings)
#   1: 1-layer GRU + attention (averaged embeddings)
#   2: 2-D CNN (GloVe only)
#   3: GRU + avg/max pooling (averaged embeddings)
#   4: LSTM + attention (averaged embeddings)
#   5: LSTM + attention (GloVe only)
#   6: LSTM + attention (Paragram only)
#   7: dense classifier on the hand-crafted metadata features
models = [model_gru_atten_3(embedding_matrix),
          model_gru_srk_atten(embedding_matrix),
          model_cnn(embedding_matrix_1),
          model_lstm_du(embedding_matrix),
          model_lstm_atten(embedding_matrix),
          model_lstm_atten(embedding_matrix_1),
          model_lstm_atten(embedding_matrix_3),
          model_metafeatures(train_meta.shape[1])]

In [51]:
train_pred(models[0], epochs = 2)

Train on 1304815 samples, validate on 1307 samples
Epoch 1/1
Train on 1304815 samples, validate on 1307 samples
Epoch 1/1


In [None]:
train_pred(models[1], epochs = 2)

Train on 1304815 samples, validate on 1307 samples
Epoch 1/1
Train on 1304815 samples, validate on 1307 samples
Epoch 1/1


In [None]:
train_pred(models[2], epochs = 2) # GloVe only

Train on 1304815 samples, validate on 1307 samples
Epoch 1/1


In [None]:
train_pred(models[3], epochs = 2)

In [None]:
train_pred(models[4], epochs = 2)

In [None]:
train_pred(models[5], epochs = 2) # Only GloVe

In [None]:
train_pred(models[6], epochs = 2) # Only Para

In [None]:
models[7].fit(train_meta, train_y, batch_size=8192, epochs=5, validation_data=(val_meta, val_y)) # Metadata-model

**Freeze weights of individual models. Train ensemble's weights**

In [None]:
def deep_ensemble_model(indi_models, meta_model):
    """Stack already-trained models behind one trainable sigmoid unit.

    Every layer of the individual text models and of the metadata model is
    frozen, so only the final Dense layer that blends their outputs is
    trained.

    Args:
        indi_models: text models, each taking a ``(maxlen,)`` token-id input.
        meta_model: model taking the metadata feature vector as its single input.

    Returns:
        A compiled Keras ``Model`` with inputs ``[token_ids, metadata]`` and a
        single sigmoid output.
    """
    # Freeze weights of individual models
    for sub_model in indi_models:
        for layer in sub_model.layers:
            layer.trainable = False
    for layer in meta_model.layers:
        layer.trainable = False

    inp = Input(shape=(maxlen,))
    # Take the metadata width from the metadata model itself rather than from
    # a global array, so the function only depends on its arguments.
    inp_meta = Input(shape=(meta_model.input_shape[1],))

    x = concatenate([sub_model(inp) for sub_model in indi_models])
    x = concatenate([x, meta_model(inp_meta)])
    out = Dense(1, activation="sigmoid")(x)

    # Build the two-input ensemble explicitly and compile/return it. Compiling
    # a leftover loop variable would return one of the individual models.
    ensemble = Model(inputs=[inp, inp_meta], outputs=out)
    ensemble.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1_score])
    return ensemble

In [None]:
# Stack the text models (all but the last entry) with the metadata model (last
# entry), fit the ensemble for one epoch, then score the test set.
parent_model = deep_ensemble_model(indi_models=models[:-1], meta_model=models[-1])
parent_model.fit(
    [train_X, train_meta],
    train_y,
    batch_size=4096,
    epochs=1,
    validation_data=([val_X, val_meta], val_y),
)
pred_test_y = parent_model.predict([test_X, test_meta])

In [None]:
# Find best threshold (optimizing on validation set)
# The ensemble takes both the token ids and the metadata features, and the
# scan scores hard 0/1 labels with scikit-learn's F1 (`metrics` is
# sklearn.metrics); the Keras `f1_score` metric operates on backend tensors.
pred_val_y = parent_model.predict([val_X, val_meta]).ravel()
best_threshold, best_value = 0, 0
for threshold in np.arange(0, 1, 1e-3):
    value = metrics.f1_score(val_y, (pred_val_y > threshold).astype(int))
    if value > best_value:
        best_threshold, best_value = threshold, value

In [None]:
# Turn the ensemble's test probabilities into 0/1 labels with the threshold
# chosen on the validation set. NOTE: this overwrites pred_test_y, so the
# probabilities are gone after this cell has run.
pred_test_y = (pred_test_y > best_threshold).astype(int)
# Re-read only the question ids and write qid/prediction pairs to submission.csv.
test_df = pd.read_csv(base_dir +"/test.csv", usecols=["qid"])
out_df = pd.DataFrame({"qid":test_df["qid"].values})
out_df['prediction'] = pred_test_y
out_df.to_csv("submission.csv", index=False)