In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the names of the entries found directly under the "../input" directory.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import time
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
# Read the training and test CSV files into DataFrames, then report their shapes.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)
print("Train shape : ", train_df.shape)
print("Test shape : ", test_df.shape)

In [None]:
# Preview the first five training rows.
train_df.head(n=5)

In [None]:
# Preview the first five test rows.
test_df.head(n=5)

In [None]:
# Hold out a validation set from the training frame, with a fixed seed so the
# split is repeatable.
# NOTE: `train_df` is overwritten with the reduced training part, so running
# this cell a second time splits the already-reduced frame again.
train_df, val_df = train_test_split(
    train_df,
    test_size=0.1,
    random_state=2018,
)

In [None]:
# Model / preprocessing configuration:
#   embed_size   - output dimension of the Embedding layer
#   max_features - `num_words` for the Tokenizer and input dimension of the Embedding layer
#   maxlen       - length every token sequence is padded to
embed_size, max_features, maxlen = 300, 50000, 100

In [None]:
# Pull the question text out of each frame as an array, replacing missing
# values with the placeholder '_na_'.
train_X, val_X, test_X = (
    frame['question_text'].fillna('_na_').values
    for frame in (train_df, val_df, test_df)
)

In [None]:
# Fit the tokenizer on the training texts only, then convert every split to
# integer sequences.
# NOTE: train_X / val_X / test_X are overwritten here, so re-running this cell
# requires re-running the text-extraction cell first.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X, val_X, test_X = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_X, val_X, test_X)
)

In [None]:
# Bring every token sequence in each split to a common length of `maxlen`.
train_X, val_X, test_X = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (train_X, val_X, test_X)
)

In [None]:
# Target arrays for the training and validation frames.
train_y, val_y = (frame['target'].values for frame in (train_df, val_df))

In [None]:
def gru_model(max_features, maxlen, emb=None):
    """Build and compile a bidirectional GRU binary classifier.

    Architecture: Embedding -> Bidirectional(CuDNNGRU(64), sequences returned)
    -> GlobalMaxPool1D -> Dense(16, relu) -> Dropout(0.1) -> Dense(1, sigmoid).
    The model is compiled with binary cross-entropy loss, the Adam optimizer
    and the accuracy metric.

    Args:
        max_features: Vocabulary size used for the Embedding layer when no
            pretrained weights are supplied.
        maxlen: Length of each input token sequence.
        emb: Optional one-element list ``[matrix]`` holding a pretrained
            embedding matrix of shape ``(vocab, dim)``. When given, the
            Embedding layer takes its shape from the matrix rather than from
            ``max_features`` and the module-level ``embed_size``, so the
            weights always fit the layer.

    Returns:
        The compiled Keras ``Model``.
    """
    # Without pretrained weights, fall back to the configured dimensions.
    vocab_size, vector_size = max_features, embed_size
    if emb is not None:
        # The layer must have exactly the shape of the weights it is given.
        vocab_size, vector_size = np.shape(emb[0])

    inp = Input(shape=(maxlen,))
    x = Embedding(vocab_size, vector_size, weights=emb)(inp)
    # NOTE: CuDNNGRU is the cuDNN-backed GRU layer; it needs a GPU runtime.
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dense(16, activation='relu')(x)
    x = Dropout(rate=0.1)(x)
    x = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

def train_eval_predict(model, train_X, train_y, val_X, val_y, test_X, batch_size, epochs):
    """Fit ``model``, print validation F1 over a threshold sweep, and predict on the test set.

    Args:
        model: Object exposing ``fit`` and ``predict`` (a compiled Keras model).
        train_X, train_y: Training inputs and targets.
        val_X, val_y: Validation inputs and targets; also passed to ``fit`` as
            ``validation_data``.
        test_X: Test inputs to score after training.
        batch_size: Batch size for training; prediction uses twice this value.
        epochs: Number of training epochs.

    Returns:
        The model's predictions for ``test_X``.
    """
    predict_batch = batch_size * 2

    model.fit(train_X, train_y, batch_size=batch_size, epochs=epochs, validation_data=(val_X, val_y))

    val_scores = model.predict([val_X], batch_size=predict_batch, verbose=1)

    # Report F1 on the validation set for cut-offs from 0.10 up to 0.50 in steps of 0.01.
    for raw_cutoff in np.arange(0.1, 0.501, 0.01):
        cutoff = np.round(raw_cutoff, 2)
        val_labels = (val_scores > cutoff).astype(int)
        f1 = metrics.f1_score(val_y, val_labels)
        print(f'F1 score at threshold {cutoff} is {f1}')

    return model.predict([test_X], batch_size=predict_batch, verbose=1)

def with_embedding(path, word_index=None, num_words=None):
    """Build an embedding matrix for a vocabulary from a text embedding file.

    Each usable line of the file has the form ``word v1 v2 ... vN`` with
    single-space separators. Lines of 100 characters or fewer are skipped
    (presumably to drop short header lines).

    Rows for words that are absent from the file keep a random draw from a
    normal distribution with the mean and standard deviation of all loaded
    vectors.

    Args:
        path: Path of the embedding text file.
        word_index: Mapping ``word -> integer index``. Defaults to the
            module-level ``tokenizer.word_index``.
        num_words: Upper bound on the number of rows; words whose index is at
            or above the row count are ignored. Defaults to the module-level
            ``max_features``.

    Returns:
        Array of shape ``(min(num_words, len(word_index) + 1), dim)`` whose
        row ``i`` is the vector of the word with index ``i``.

    Raises:
        ValueError: If no embedding vector could be read from ``path``.
    """
    if word_index is None:
        word_index = tokenizer.word_index
    if num_words is None:
        num_words = max_features

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # The context manager closes the file once the index is built.
    with open(path, encoding="utf8", errors='ignore') as embedding_file:
        embeddings_index = dict(
            get_coefs(*line.split(' '))
            for line in embedding_file
            if len(line) > 100
        )
    if not embeddings_index:
        raise ValueError(f'No embedding vectors could be read from {path!r}')

    # np.stack needs a real sequence, not a dict view.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # Word indices start at 1, so the highest index equals len(word_index) and
    # the matrix needs one extra row for it to be addressable.
    nb_words = min(num_words, len(word_index) + 1)

    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [None]:
def result_model(max_features, maxlen, train_X, train_y, val_X, val_y, test_X, batch_size, epochs,
                 use_emb=False, path=''):
    """Build a GRU model, train and evaluate it, and return its test predictions.

    Pretrained embeddings are loaded from ``path`` only when ``use_emb`` is
    true and ``path`` is non-empty; otherwise the model is built without
    pretrained weights.

    Returns:
        The test-set predictions produced by ``train_eval_predict``.
    """
    # gru_model expects the pretrained matrix wrapped in a one-element list.
    weights = [with_embedding(path)] if (use_emb and path) else None
    model = gru_model(max_features, maxlen, weights)
    return train_eval_predict(model, train_X, train_y, val_X, val_y, test_X, batch_size, epochs)
        
    

In [None]:
# Run without pretrained embeddings.
pred_noemb_test_y = result_model(
    max_features, maxlen,
    train_X, train_y, val_X, val_y, test_X,
    batch_size=512, epochs=2,
)

In [None]:
# Run with embeddings loaded from the GloVe file.
pred_glove_test_y = result_model(
    max_features, maxlen,
    train_X, train_y, val_X, val_y, test_X,
    batch_size=512, epochs=2,
    use_emb=True,
    path='../input/embeddings/glove.840B.300d/glove.840B.300d.txt',
)

In [None]:
# Force a garbage-collection pass, then pause for five seconds before the next run.
import gc

gc.collect()
time.sleep(5)

In [None]:
# Run with embeddings loaded from the wiki-news .vec file.
pred_fasttext_test_y = result_model(
    max_features, maxlen,
    train_X, train_y, val_X, val_y, test_X,
    batch_size=512, epochs=2,
    use_emb=True,
    path='../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec',
)

In [None]:
# Run with embeddings loaded from the paragram file.
pred_paragram_test_y = result_model(
    max_features, maxlen,
    train_X, train_y, val_X, val_y, test_X,
    batch_size=512, epochs=2,
    use_emb=True,
    path='../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt',
)

In [None]:
# Blend the three pretrained-embedding runs with weights 0.33 / 0.33 / 0.34,
# turn the blended score into a 0/1 label at a 0.35 cut-off, and write the
# labels next to the test ids in submission.csv.
pred_test_y = (
    (0.33*pred_glove_test_y + 0.33*pred_fasttext_test_y + 0.34*pred_paragram_test_y) > 0.35
).astype(int)

out_df = pd.DataFrame({"qid":test_df["qid"].values})
out_df['prediction'] = pred_test_y
out_df.to_csv("submission.csv", index=False)