**Refer:**

**Based on**: https://www.kaggle.com/danofer/different-embeddings-with-attention-fork

**SRK**: https://www.kaggle.com/sudalairajkumar/a-look-at-different-embeddings . Little has changed from this kernel except the neural net architecture and the final weights of the embeddings.  

**Code for attention layer is taken from Khoi Nguyen**: https://www.kaggle.com/suicaokhoailang/lstm-attention-baseline-0-652-lb

**CNN**: https://www.kaggle.com/yekenot/2dcnn-textclassifier

In [None]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input,Conv2D, MaxPool2D ,CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.layers import concatenate
from keras.callbacks import *
from keras.layers import Reshape, Flatten, Concatenate, SpatialDropout1D

In [None]:
# Load the train and test CSV files from ../input and print the shape of each
# frame as a quick sanity check.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
print("Train shape : ",train_df.shape)
print("Test shape : ",test_df.shape)

Next steps are as follows:
 * Split the training dataset into train and val sample. Cross validation is a time consuming process and so let us do simple train val split.
 * Fill up the missing values in the text column with '_na_'
 * Tokenize the text column and convert them to vector sequences
 * Pad the sequence as needed - if the number of words in the text is greater than 'max_len', truncate them to 'max_len'; if the number of words in the text is less than 'max_len', add zeros for the remaining values.

In [None]:
# Replacement table applied to the question text: contractions are expanded,
# British spellings are mapped to American ones, and common typos / fused
# words are corrected. The keys are joined into one regular expression by
# _get_mispell().
mispell_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": 
"we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

# Compile the replacement table once; the function below reuses both objects.
mispellings, mispellings_re = _get_mispell(mispell_dict)


def replace_typical_misspell(text):
    """Return ``text`` with every match of ``mispellings_re`` replaced.

    Each matched substring is looked up in ``mispellings`` and substituted
    by the value stored there.
    """
    return mispellings_re.sub(lambda hit: mispellings[hit.group(0)], text)
# Series.progress_apply only exists after tqdm has registered its pandas
# integration, so register it before the first use.
tqdm.pandas()
# Apply the spelling / contraction replacements to both question columns.
train_df["question_text"] = train_df["question_text"].progress_apply(replace_typical_misspell)
test_df["question_text"] = test_df["question_text"].apply(replace_typical_misspell)



# Build the corpus used to fit the tokenizer from the train and test
# questions together: missing questions become a single space and every
# question is split on whitespace, so each entry of text_list is a list of
# tokens.
train_X = train_df["question_text"].fillna(" ")
test_X = test_df["question_text"].fillna(" ")
text_list = pd.concat([train_X,test_X]).apply(lambda x: x.split())


# Single-pass character table equivalent to the three replacement rounds.
# It is built in reverse priority order so that '/' and '-' end up mapped to
# a space even though they also appear in the removal list.
_CLEAN_TEXT_TABLE = {
    ord(ch): None
    for ch in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’'
}
_CLEAN_TEXT_TABLE.update({ord(ch): f' {ch} ' for ch in '&'})
_CLEAN_TEXT_TABLE.update({ord(ch): ' ' for ch in "/-"})


def clean_text(x):
    """Return ``str(x)`` with punctuation normalised.

    '/' and '-' become a space, '&' is surrounded by spaces, and the
    remaining punctuation characters (including curly quotes) are removed.
    """
    return str(x).translate(_CLEAN_TEXT_TABLE)


# clean_text() calls str() on its argument, so each token list is first turned
# into its printed list form; the brackets, quotes and commas of that form are
# then stripped together with the other punctuation, leaving one cleaned
# string per question. The result is materialised as a plain Python list.
text_list = text_list.apply(lambda x: clean_text(x))
text_list = list(text_list.values)



# Three NLTK stemmers (Porter, Lancaster, Snowball/English). load_glove() uses
# them, in this order, as last-resort lookups for words that have no embedding
# under their original or re-cased spelling.
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.stem.lancaster import LancasterStemmer
lc = LancasterStemmer()
from nltk.stem import SnowballStemmer
sb = SnowballStemmer("english")


In [None]:
# Master switch for the validation / inspection cells: the validation arrays,
# the attention model and the error-analysis cells are guarded by this flag.
debug = True

In [None]:
## split to train and val
# 8% of the rows are held out; train_df is rebound to the remaining 92%.
train_df, val_df = train_test_split(train_df, test_size=0.08, random_state=2018)

## some config values 
embed_size = 300 # how big is each word vector
max_features = 95000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use


## fill up the missing values
train_X = train_df["question_text"].fillna("_na_").values
#val_X = val_df["question_text"].fillna("_na_").values
test_X = test_df["question_text"].fillna("_na_").values

## Tokenize the sentences
# The tokenizer is fitted on text_list (cleaned train + test questions built
# before the split), not on train_X alone, so its vocabulary also covers the
# validation and test questions.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(text_list))
train_X = tokenizer.texts_to_sequences(train_X)
#val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

## Pad the sentences 
# padding='post' places the padding after the tokens of short questions.
train_X = pad_sequences(train_X, maxlen=maxlen,padding='post')
#val_X = pad_sequences(val_X, maxlen=maxlen,padding='post')
test_X = pad_sequences(test_X, maxlen=maxlen,padding='post')

## Get the target values
train_y = train_df['target'].values
#val_y = val_df['target'].values


# The validation arrays are only built in debug mode, using the same
# fill / tokenize / pad steps as the training data.
# NOTE(review): later cells read val_X and val_y without checking `debug`.
if debug:
    val_X = val_df["question_text"].fillna("_na_").values
    val_X = tokenizer.texts_to_sequences(val_X)
    val_X = pad_sequences(val_X, maxlen=maxlen,padding='post')
    val_y = val_df['target'].values
    


In [None]:
'''
## split to train and val
train_df, val_df = train_test_split(train_df, test_size=0.08, random_state=2018)

## some config values 
embed_size = 300 # how big is each word vector
max_features = 95000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use

## fill up the missing values
train_X = train_df["question_text"].fillna("_##_").values
val_X = val_df["question_text"].fillna("_##_").values
test_X = test_df["question_text"].fillna("_##_").values

## Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

## Pad the sentences 
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

## Get the target values
train_y = train_df['target'].values
val_y = val_df['target'].values
'''

In [None]:
'''
#shuffling the data
np.random.seed(2018)
trn_idx = np.random.permutation(len(train_X))
val_idx = np.random.permutation(len(val_X))

train_X = train_X[trn_idx]
val_X = val_X[val_idx]
train_y = train_y[trn_idx]
val_y = val_y[val_idx]
'''

In [None]:
def load_glove(word_dict,
               embedding_file='../input/embeddings/glove.840B.300d/glove.840B.300d.txt'):
    """Build an embedding matrix for ``word_dict`` from a GloVe text file.

    Every word is looked up under several spellings, in this order: as is,
    lower-cased, upper-cased, capitalised, then stemmed with the module-level
    Porter (``ps``), Lancaster (``lc``) and Snowball (``sb``) stemmers. The
    first spelling found in the embedding file supplies the vector.

    Args:
        word_dict: mapping from word to its integer row index (for example
            ``tokenizer.word_index``).
        embedding_file: path of the GloVe text file, one
            ``word v1 v2 ... vN`` entry per line.

    Returns:
        ``(embedding_matrix, nb_words)`` where ``nb_words`` is
        ``len(word_dict) + 1`` and ``embedding_matrix`` is a float32 array of
        shape ``(nb_words, vector_size)``. Rows for which no spelling was
        found (and row 0, which no word maps to when indices start at 1)
        keep the initial value -1.

    Raises:
        ValueError: if the embedding file contains no entries.
    """
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # The context manager closes the file once it is parsed; the explicit
    # encoding keeps the result independent of the platform default.
    with open(embedding_file, encoding='utf8') as handle:
        embeddings_index = dict(get_coefs(*line.split(" ")) for line in handle)

    if not embeddings_index:
        raise ValueError('no embeddings found in %s' % embedding_file)

    # Read the vector width from a single entry rather than stacking the whole
    # table into one extra array just to inspect its shape.
    embed_size = len(next(iter(embeddings_index.values())))
    nb_words = len(word_dict) + 1

    embedding_matrix = np.zeros((nb_words, embed_size), dtype=np.float32) - 1.

    # Spelling variants in lookup priority order. They are evaluated lazily,
    # so the stemmers only run for words that every cheaper variant missed.
    variants = (
        lambda word: word,
        str.lower,
        str.upper,
        str.capitalize,
        ps.stem,
        lc.stem,
        sb.stem,
    )
    for key, i in tqdm(word_dict.items()):
        for variant in variants:
            embedding_vector = embeddings_index.get(variant(key))
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                break

    return embedding_matrix, nb_words

In [None]:
# Build the GloVe embedding matrix for the tokenizer's full word index;
# nb_words_g is the number of rows in that matrix.
embedding_matrix_glove,nb_words_g = load_glove(tokenizer.word_index)

In [None]:
'''
EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))

all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = len(word_index)+1
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector
'''

In [None]:
class Attention(Layer):
    """
    Computes a weighted average of the different channels across timesteps.
    Uses 1 parameter pr. channel to compute the attention value for a single timestep.

    # Arguments
        return_attention: when True the layer returns
            ``[weighted_average, attention_weights]`` instead of only the
            weighted average.
    """

    def __init__(self, return_attention=False, **kwargs):
        self.init = initializers.RandomUniform(seed=10000)
        self.supports_masking = True
        self.return_attention = return_attention
        super(Attention, self).__init__(** kwargs)

    def build(self, input_shape):
        """Create the single projection weight of shape (channels, 1)."""
        assert len(input_shape) == 3

        # add_weight() already registers W as a trainable weight of the layer.
        # Assigning self.trainable_weights by hand is redundant and fails on
        # Keras versions where that attribute is a read-only property.
        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init)
        super(Attention, self).build(input_shape)

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over the timestep axis.

        ``x`` is a 3-D tensor (batch, timesteps, channels), as asserted in
        ``build``. When ``return_attention`` is set, the per-timestep weights
        are returned as a second output.
        """
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # reshape is done to avoid issue with Tensorflow
        # and 1-dimensional weights
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        # epsilon guards against division by zero when every timestep is masked
        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        """Alias that forwards to ``compute_output_shape``."""
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        """Return (batch, channels), plus (batch, timesteps) for the weights."""
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        """Drop the mask: the timestep axis is summed out by this layer."""
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None

    def get_config(self):
        """Include ``return_attention`` so the layer can be re-created from its config."""
        config = super(Attention, self).get_config()
        config['return_attention'] = self.return_attention
        return config

In [None]:

# https://www.kaggle.com/hireme/fun-api-keras-f1-metric-cyclical-learning-rate/code

class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or 
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.
    For more detail, please see paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode parameter is ignored 
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.

    # Raises
        ValueError: if scale_fn is None and mode is not one of the
            three built-in policies.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
            else:
                # Fail here rather than with an AttributeError on the first
                # batch, when clr() would look for the missing scale_fn.
                raise ValueError(
                    "mode must be 'triangular', 'triangular2' or 'exp_range' "
                    "when scale_fn is None, got %r" % (mode,))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.

    def clr(self):
        """Return the learning rate for the current value of clr_iterations."""
        # cycle: 1-based index of the current full (up + down) cycle
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        # x: distance from the peak of the cycle, in units of step_size
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)

    def on_train_begin(self, logs=None):
        """Set the optimizer's lr for the current position in the cycle.

        The iteration counter is not reset here, so repeated fit() calls with
        the same callback instance continue the cycle where it stopped.
        """
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())

    def on_batch_end(self, epoch, logs=None):
        """Record the lr and batch logs in history, then set the next lr.

        The first positional argument keeps its original name ``epoch``; it
        is not used by this method.
        """
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        K.set_value(self.model.optimizer.lr, self.clr())
    

In [None]:

def f1(y_true, y_pred):
    '''
    Batch-wise F1 score built from batch-wise precision and recall.

    metric from here 
    https://stackoverflow.com/questions/43547402/how-to-calculate-f1-macro-in-keras

    Values are clipped to [0, 1] and rounded before counting, and epsilon is
    added to every denominator to avoid division by zero.
    '''
    def _rounded_count(tensor):
        # number of entries that round to 1 after clipping to [0, 1]
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    eps = K.epsilon()
    true_positives = _rounded_count(y_true * y_pred)

    # precision: share of predicted positives that are correct
    prec = true_positives / (_rounded_count(y_pred) + eps)
    # recall: share of actual positives that were found
    rec = true_positives / (_rounded_count(y_true) + eps)

    return 2*((prec*rec)/(prec+rec+eps))

In [None]:
# Cyclical learning-rate callback shared by every fit() call below: the lr
# moves between 0.001 and 0.002 with a half cycle of 300 batches, and the
# amplitude is scaled by gamma**iterations ('exp_range' mode).
clr = CyclicLR(base_lr=0.001, max_lr=0.002,
               step_size=300., mode='exp_range',
               gamma=0.99994)

In [None]:
#samples_weights = list(np.ones(100))

In [None]:
#from sklearn.utils import class_weight
#sample_weights = class_weight.compute_sample_weight('balanced', train_y)

In [None]:
# Only the GloVe embeddings are used; expose them under the generic names the
# model cells read.
embedding_matrix = embedding_matrix_glove
nb_words = nb_words_g

In [None]:
# Attention model over token ids: frozen GloVe embedding -> BiLSTM -> BiGRU,
# with attention and global max pooling taken from both recurrent outputs.
# NOTE(review): `model` is only created when debug is True, but later cells
# use it unconditionally.
if debug:
    inp = Input(shape=(maxlen,))
    
    # The layer is named 'Embedding' so later cells can look it up by name.
    x = Embedding(nb_words, 300, weights=[embedding_matrix], trainable=False,name='Embedding')(inp)
    x = SpatialDropout1D(0.1)(x)
    x = Bidirectional(CuDNNLSTM(256, return_sequences=True))(x)
    y = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)

    # return_attention=True also yields the per-timestep weights (att_1, att_2).
    atten_1, att_1 = Attention(return_attention=True, name='Atten_1')(x) # skip connect
    atten_2, att_2 = Attention(return_attention=True, name='Atten_2')(y)
    max_pool_1 = GlobalMaxPooling1D()(x)
    max_pool_2 = GlobalMaxPooling1D()(y)

    conc = concatenate([atten_1,atten_2,max_pool_1,max_pool_2])
    conc = Dense(16, activation="relu")(conc)
    conc = Dropout(0.1)(conc)
    outp = Dense(1, activation="sigmoid")(conc)    

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1])
    model.summary()

In [None]:
# Train the attention model on the padded token ids for two epochs with the
# cyclical learning-rate callback.
model.fit(train_X,train_y,batch_size=512,callbacks=[clr,],epochs=2,verbose=1)

In [None]:
# Index the model's layers by name so individual layers (and their symbolic
# outputs) can be fetched by the unique names given above.
layer_dict = {layer.name: layer for layer in model.layers}

print(layer_dict)

In [None]:
# Switch for the cells that build the gradient-based embedding perturbation.
gradient_test=True

In [None]:
import tensorflow as tf

In [None]:
# Build the symbolic, L2-normalised gradient of the loss with respect to the
# embedding output (used later as a perturbation added to the embeddings).
if gradient_test:
    # an input layer to feed labels
    y_true = Input(shape=(1,))
    # compute loss based on model's output and true labels
    ce = K.binary_crossentropy(y_true, model.output)
    # K.gradients returns a list with one tensor per requested variable.
    # Unwrap it so the reductions below run over the real axes of the gradient
    # (batch, timestep, embedding) rather than over a list-induced leading axis.
    grad = K.gradients(ce,[layer_dict['Embedding'].output])[0]
    # L2 norm of each sample's gradient over the timestep and embedding axes;
    # keepdims lets it broadcast against grad in the division.
    grad_norm = K.sqrt(K.sum(K.square(grad), axis=[1, 2], keepdims=True))
    # epsilon avoids a division by zero for samples with an all-zero gradient.
    # The leading axis of size 1 is kept because the consumers of this tensor
    # index the result as [0][0].
    grad_ce = K.expand_dims(grad / (grad_norm + K.epsilon()), axis=0)

In [None]:
# Backend function mapping (embedding output, labels) to the normalised
# gradient tensor grad_ce.
if gradient_test:
    # create a function to be able to run this computation graph
    func = K.function([layer_dict['Embedding'].output] + [y_true], [grad_ce])

In [None]:
# Sub-model mapping padded token ids to the output of the 'Embedding' layer.
# NOTE(review): created only when gradient_test is True, but train(),
# train_with_noise() and the scoring cells use it unconditionally.
if gradient_test:
    model_intermediate = Model(inputs=model.input, outputs=model.get_layer('Embedding').output)

In [None]:
# Smoke test: embedding output for the first five training questions.
model_intermediate.predict(train_X[:5])

In [None]:
# Smoke test: normalised gradient for the first five training questions.
# The labels are reshaped to one column so they match the (batch, 1) shape of
# the y_true placeholder.
func([model_intermediate.predict(train_X[:5])]+[train_y[:5].reshape(-1, 1)])[0]

In [None]:
def train(dataset, model, batch_size=512, epochs=1):
    """Fit ``model`` on embedded chunks drawn from ``dataset``.

    Each chunk of token ids is converted to embeddings with the module-level
    ``model_intermediate`` and passed to ``model.fit`` with the module-level
    ``clr`` callback.

    Args:
        dataset: object whose ``next_batch(batch_size)`` returns
            ``(keep_going, (x, y))``; ``keep_going`` is False on the chunk
            that closes an epoch.
        model: Keras model taking embedded sequences as input.
        batch_size: number of rows requested from ``dataset`` per chunk.
            The mini-batch size inside ``model.fit`` is fixed at 512.
        epochs: number of passes over ``dataset``.
    """
    for _ in range(epochs):
        keep = True
        while keep:
            keep, (train_dt_x, train_dt_y) = dataset.next_batch(batch_size)
            if len(train_dt_x) == 0:
                # The epoch-closing chunk is empty when the data size is an
                # exact multiple of batch_size; there is nothing to fit on.
                continue
            train_dt_emb = model_intermediate.predict(train_dt_x)
            model.fit(train_dt_emb,train_dt_y,callbacks=[clr,],verbose=2,batch_size=512)

In [None]:
def train_with_noise(dataset, model,batch_size=512, epochs=1):
    """Fit ``model`` on embedded chunks perturbed by the normalised gradient.

    Works like ``train`` but, before fitting, adds to every chunk of
    embeddings the perturbation returned by the module-level ``func`` for
    that chunk and its labels.

    Args:
        dataset: object whose ``next_batch(batch_size)`` returns
            ``(keep_going, (x, y))``; ``keep_going`` is False on the chunk
            that closes an epoch.
        model: Keras model taking embedded sequences as input.
        batch_size: number of rows requested from ``dataset`` per chunk.
            The mini-batch size inside ``model.fit`` is fixed at 512.
        epochs: number of passes over ``dataset``.
    """
    for _ in range(epochs):
        keep = True
        while keep:
            keep, (train_dt_x, train_dt_y) = dataset.next_batch(batch_size)
            if len(train_dt_x) == 0:
                # The epoch-closing chunk is empty when the data size is an
                # exact multiple of batch_size; there is nothing to fit on.
                continue
            train_dt_emb = model_intermediate.predict(train_dt_x)
            # func feeds the labels into a (batch, 1) placeholder, so pass
            # them as a column. Its result is a one-element list holding a
            # tensor with a leading axis of size 1, hence the [0][0].
            labels_column = train_dt_y.reshape(-1, 1)
            noise = func([train_dt_emb]+[labels_column])[0][0]
            train_dt_emb = train_dt_emb + noise
            model.fit(train_dt_emb,train_dt_y,callbacks=[clr,],verbose=2,batch_size=512)

In [None]:
class Dataset():
    """Serves aligned ``(x, y)`` batches from two arrays, one epoch at a time.

    Args:
        data_x: array of inputs; must support indexing with an index array.
        data_y: array of targets aligned with ``data_x``.
        num_data: number of rows to serve per epoch.
    """

    def __init__(self, data_x, data_y, num_data):
        self.data_x=data_x
        self.data_y=data_y
        self.index_in_epoch=0
        self.num_data = num_data
        self.epochs_completed=0

    def _start_epoch(self, shuffle):
        """Prepare the (optionally shuffled) view served during one epoch."""
        idx = np.arange(0, self.num_data)
        if shuffle:
            np.random.shuffle(idx)
        self._data_x = self.data_x[idx]
        self._data_y = self.data_y[idx]
        self.index_in_epoch = 0

    def next_batch(self,batch_size, shuffle=True):
        """Return the next batch of the current epoch.

        Args:
            batch_size: maximum number of rows in the batch.
            shuffle: whether the row order is shuffled when an epoch is
                prepared (at the very first call and whenever an epoch ends).

        Returns:
            ``(keep_going, (batch_x, batch_y))``. ``keep_going`` is False for
            the batch that completes the epoch; that batch holds the
            remaining rows and may be shorter than ``batch_size``.
        """
        start=self.index_in_epoch
        if start==0 and self.epochs_completed==0:
            self._start_epoch(shuffle)

        end = start+batch_size
        # ">=" so the batch that reaches the last row closes the epoch itself;
        # with ">" an exact multiple of batch_size produced one extra, empty
        # batch at the end of every epoch.
        if end>=self.num_data:
            self.epochs_completed += 1
            last_x = self._data_x[start:self.num_data]
            last_y = self._data_y[start:self.num_data]
            # prepare the next epoch; the slices taken above stay valid
            self._start_epoch(shuffle)
            return (False,(last_x,last_y))

        self.index_in_epoch = end
        return (True,(self._data_x[start:end],self._data_y[start:end]))

            
# dataset serves every training row; dataset_augmented serves only the rows
# whose target is 1.
dataset = Dataset(train_X,train_y,len(train_y))
dataset_augmented = Dataset(train_X[train_y==1],train_y[train_y==1],len(train_y[train_y==1]))

In [None]:
def build_model(embedding_matrix, nb_words, embedding_size=300):
    """Build and compile a BiLSTM -> BiGRU classifier over embedded sequences.

    The model has no Embedding layer: its input is the already-embedded
    sequence of shape ``(maxlen, embedding_size)``, where ``maxlen`` is the
    module-level constant.

    Args:
        embedding_matrix: unused; kept so existing callers keep working.
        nb_words: unused; kept so existing callers keep working.
        embedding_size: width of each input word vector.

    Returns:
        The compiled Keras model (Adam optimizer, binary cross-entropy loss,
        ``f1`` metric).
    """
    # Use the embedding_size argument rather than a hard-coded 300 so the
    # input shape follows the parameter.
    inp = Input(shape=(maxlen, embedding_size,))
    x = SpatialDropout1D(0.3)(inp)
    x1 = Bidirectional(CuDNNLSTM(256, return_sequences=True))(x)
    x2 = Bidirectional(CuDNNGRU(128, return_sequences=True))(x1)
    # Max-pool both recurrent outputs over time and classify on the pair.
    max_pool1 = GlobalMaxPooling1D()(x1)
    max_pool2 = GlobalMaxPooling1D()(x2)
    conc = Concatenate()([max_pool1, max_pool2])
    predictions = Dense(1, activation='sigmoid')(conc)
    model = Model(inputs=inp, outputs=predictions)
    adam = optimizers.Adam()
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=[f1])
    return model

In [None]:
# Second model, trained on embedding vectors rather than on token ids.
_model = build_model(embedding_matrix,nb_words,300)

In [None]:
# One pass over the training data in chunks of 5000 rows, without perturbation.
train(dataset,_model,5000)

In [None]:
# One pass over the training data in chunks of 2000 rows, with the gradient
# perturbation added to the embeddings.
train_with_noise(dataset,_model,2000)

In [None]:
# Validation subset used for scoring: at most the first 100000 rows of val_X.
val_X_test = val_X[:100000]

In [None]:
# Score the validation subset in chunks of 500 rows: embed each chunk, then
# predict with _model. The chunk boundaries follow len(val_X_test), so a
# subset shorter than 100000 rows never yields an empty chunk, and the chunks
# are concatenated once at the end rather than once per iteration.
chunks = [np.ones(0)]  # float64 seed keeps the dtype of `result` as before
for start in range(0, len(val_X_test), 500):
    end = start + 500
    tmp = model_intermediate.predict(val_X_test[start:end])
    r = _model.predict(tmp)[:,0]
    chunks.append(r)
result = np.concatenate(chunks)

result.shape

In [None]:
from sklearn.metrics import f1_score

In [None]:
# F1 on the validation subset, with predictions binarised at 0.35.
f1_score(val_y[:100000],result>0.35)

In [None]:
# Another pass over the training data in chunks of 5000 rows, without
# perturbation.
train(dataset,_model,5000)

In [None]:
# Score the validation subset in chunks of 500 rows: embed each chunk, then
# predict with _model. The chunk boundaries follow len(val_X_test), so a
# subset shorter than 100000 rows never yields an empty chunk, and the chunks
# are concatenated once at the end rather than once per iteration.
chunks = [np.ones(0)]  # float64 seed keeps the dtype of `result` as before
for start in range(0, len(val_X_test), 500):
    end = start + 500
    tmp = model_intermediate.predict(val_X_test[start:end])
    r = _model.predict(tmp)[:,0]
    chunks.append(r)
result = np.concatenate(chunks)

result.shape

In [None]:
# F1 on the validation subset, with predictions binarised at 0.35.
f1_score(val_y[:100000],result>0.35)

In [None]:
# Score the validation subset in chunks of 500 rows and report F1 at a 0.35
# threshold. The chunk boundaries follow len(val_X_test), so a subset shorter
# than 100000 rows never yields an empty chunk, and the chunks are
# concatenated once at the end rather than once per iteration.
chunks = [np.ones(0)]  # float64 seed keeps the dtype of `result` as before
for start in range(0, len(val_X_test), 500):
    end = start + 500
    tmp = model_intermediate.predict(val_X_test[start:end])
    r = _model.predict(tmp)[:,0]
    chunks.append(r)
result = np.concatenate(chunks)

result.shape
f1_score(val_y[:100000],result>0.35)

In [None]:
'''
class AddNoise(Layer):
    def __init__(self, noise_func=None, **kwargs):
        super(AddNoise,self).__init__(**kwargs)
        self.noise_func = noise_func
    
    def build(self,input_shape):
        #batch, 100, 300
        super(AddNoise,self).build(input_shape)
    
    def call(self,x):
        yval=x[1]
        embedding_outputs=x[0]
        noise=noise_func([embedding_outputs]+[yval])
        return K.math.add(embedding_outputs,noise)
    
    def get_output_shape_for(self, input_shape):
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        output_shape = input_shape[0]
        return output_shape
'''

In [None]:
# Mark false negatives: slice_arr[i] is 1 where the prediction is 0 but the
# true validation label is 1, and 0 otherwise.
# NOTE(review): pred_val_y is not assigned in any visible cell — confirm it is
# defined (as binarised validation predictions) before this cell runs.
if debug:
    l = len(pred_val_y)
    slice_arr=np.zeros((l,1))
    for i in range(l):
        slice_arr[i] = pred_val_y[i]==0 and val_y[i]==1

In [None]:
# Select the padded token sequences of the rows flagged in slice_arr.
if debug:
    val_X[np.squeeze(slice_arr.astype('bool'))]

In [None]:
# For the rows flagged in slice_arr, evaluate the 'Atten_1' layer and keep the
# positions of the three largest attention weights per row.
if debug:
    debug_input = model.input
    intermediate_output, att = model.get_layer("Atten_1").output
    # The function is named debug_fn so the boolean `debug` flag, which the
    # other cells test with `if debug:`, is not overwritten.
    debug_fn = K.function([debug_input],[intermediate_output] + [att])
    _, weight = debug_fn([val_X[np.squeeze(slice_arr.astype('bool'))]])
    # argsort is ascending, so the last three columns hold the top-3 positions
    argmx = np.argsort(weight, axis=1)[:,-3:]
    #argmx = np.argmax(weight,axis=1)

In [None]:
# Display the top-3 attention positions per flagged row.
argmx

In [None]:
# Translate the top attention positions into words: every flagged question is
# split on whitespace and the words at the selected positions are collected;
# positions past the end of the question are skipped.
if debug:
    dbg = val_df[np.squeeze(slice_arr.astype('bool'))].question_text.apply(lambda x:x.split()).values
    result = [
        [dbg[row][pos] for pos in positions if pos < len(dbg[row])]
        for row, positions in enumerate(argmx)
    ]
    result

In [None]:
'''
embedding_output = layer_dict['embedding_5'].output
grads = K.gradients(Output_tensor,embedding_output )
grads /= (K.sqrt(K.mean(K.square(grads))) + 1e-5)

iterate = K.function([embedding_output], [loss, grads])

sess = K.get_session()
results = sess.run(iterate, feed_dict={model.input: numpyBatchWithData})
'''

In [None]:
'''
# https://www.kaggle.com/yekenot/2dcnn-textclassifier
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D

filter_sizes = [1,2,3,5]
num_filters = 36

inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Reshape((maxlen, embed_size, 1))(x)

maxpool_pool = []
for i in range(len(filter_sizes)):
    conv = Conv2D(num_filters, kernel_size=(filter_sizes[i], embed_size),
                                 kernel_initializer='he_normal', activation='elu')(x)
    maxpool_pool.append(MaxPool2D(pool_size=(maxlen - filter_sizes[i] + 1, 1))(conv))

z = Concatenate(axis=1)(maxpool_pool)   
z = Flatten()(z)
z = Dropout(0.1)(z)

outp = Dense(1, activation="sigmoid")(z)

model = Model(inputs=inp, outputs=outp)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
'''

In [None]:
#pred_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)
#for thresh in np.arange(0.1, 0.501, 0.01):
#    thresh = np.round(thresh, 2)
#    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_glove_val_y>thresh).astype(int))))

In [None]:
# Predict on the test set with `model` (the attention model over token ids),
# not with `_model`.
pred_glove_test_y = model.predict([test_X], batch_size=1024, verbose=1)

The result seems to be better than the individual pre-trained models, so let us create a submission file using this model blend.

In [None]:
# Binarise the test predictions at a threshold of 0.33 and write them, keyed
# by qid, to submission.csv.
pred_test_y = pred_glove_test_y
pred_test_y = (pred_test_y > 0.33).astype(int)
out_df = pd.DataFrame({"qid":test_df["qid"].values})
out_df['prediction'] = pred_test_y
out_df.to_csv("submission.csv", index=False)


**References:**

Thanks to the below kernels which helped me with this one. 
1. https://www.kaggle.com/jhoward/improved-lstm-baseline-glove-dropout
2. https://www.kaggle.com/sbongo/do-pretrained-embeddings-give-you-the-extra-edge