**Notebook Objective:**

The objective of this notebook is to look at the different pretrained embeddings provided in the dataset and to see how useful they are in the model building process.

First let us import the necessary modules and read the input data.

In [None]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input,Conv2D, MaxPool2D ,CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.layers import concatenate
from keras.callbacks import *
from keras.layers import Reshape, Flatten, Concatenate, SpatialDropout1D

from nltk import pos_tag

In [None]:
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
print("Train shape : ",train_df.shape)
print("Test shape : ",test_df.shape)

In [None]:
## some config values 
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use

## fill up the missing values
train_X = train_df["question_text"].fillna(" ")
test_X = test_df["question_text"].fillna(" ")
text_list = pd.concat([train_X,test_X])

#tokenizer = Tokenizer(num_words=max_features)
#tokenizer.fit_on_texts(list(text_list))


In [None]:
pos_tag(text_list.values[0].split())

In [None]:
import gensim
model_file = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
model = gensim.models.KeyedVectors.load_word2vec_format(model_file)

In [None]:
def most_similar_words(model, words, topn):
    """Print the `topn` nearest neighbours of every word in `words`.

    :param model: object exposing ``most_similar(positive=..., topn=...)`` that
        returns a sequence whose items carry the neighbour word at index 0
    :param words: iterable of query words
    :param topn: number of neighbours requested per query word
    :return: None; one line is printed per query word
    """
    for query in words:
        neighbours = [hit[0] for hit in model.most_similar(positive=query, topn=topn)]
        print("{} top {} similar words:".format(query, topn), neighbours)
        
most_similar_words(model,["nation"],3)

In [None]:
pos_tag(['nation','country','nations','Nation'])


In [None]:
def create_dict(neg_vocab, embedding_model, max_iter=5000, threshold=0.5, topn_=1):
    """Map words to replacement candidates taken from their embedding neighbours.

    At most ``max_iter`` words of ``neg_vocab`` are visited, in iteration order.
    For each one the ``topn_`` nearest neighbours are requested from
    ``embedding_model``; a neighbour is kept only when its similarity is above
    ``threshold`` and ``pos_tag`` gives it the same tag as the source word.

    :param neg_vocab: sized iterable of words (e.g. a tokenizer ``word_index``)
    :param embedding_model: object exposing ``most_similar(positive=..., topn=...)``
        returning ``(word, similarity)`` pairs and raising ``KeyError`` for
        words it does not know
    :param max_iter: upper bound on the number of words processed
    :param threshold: minimum similarity (exclusive) for a neighbour to be kept
    :param topn_: number of neighbours requested per word
    :return: dict ``word -> non-empty list of replacement words``; words that
        are unknown to the model or have no surviving neighbour are omitted
    """
    replacements_by_word = {}
    limit = min(len(neg_vocab), max_iter)
    for idx, word in tqdm(enumerate(neg_vocab), total=limit):
        # `>=` (not `>`) so that exactly `limit` words are processed.
        if idx >= limit:
            break
        try:
            neighbours = embedding_model.most_similar(positive=word, topn=topn_)
        except KeyError:
            # Word is not in the embedding vocabulary: nothing to suggest.
            continue
        word_tag = pos_tag([word])[0][1]
        candidates = [
            candidate
            for candidate, similarity in neighbours
            if similarity > threshold and pos_tag([candidate])[0][1] == word_tag
        ]
        if candidates:
            replacements_by_word[word] = candidates

    return replacements_by_word

# Fit a tokenizer on the questions with target == 1 only; its word_index is
# the vocabulary passed to create_dict in the next cell.
# NOTE(review): `tokenizer` is re-created and re-fitted on the full corpus in
# a later cell, so this instance only serves to build `neg_vocab`.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_df[train_df['target']==1].question_text))
neg_vocab = tokenizer.word_index
print(len(neg_vocab))


In [None]:
dictionary = create_dict(neg_vocab,model,topn_=2)

In [None]:
len(dictionary)

In [None]:
sample_neg = list(train_df[train_df['target']==1].question_text)

In [None]:
sample_neg[:5]

In [None]:
def augmenting_sample(dictionary, x):
    """Rebuild a sentence, swapping every token that has a known replacement.

    :param dictionary: mapping ``word -> list of replacements``; the first
        entry of the list is the one used
    :param x: iterable of tokens
    :return: the tokens joined with single spaces after substitution
    """
    return ' '.join(
        dictionary[token][0] if token in dictionary else token
        for token in x
    )
            
            
augmenting_sample(dictionary,sample_neg[0].split())

In [None]:
def augment(neg):
    """Apply `augmenting_sample` to every sentence in `neg`.

    Each sentence is split on whitespace and rewritten with the module-level
    `dictionary`; a tqdm progress bar tracks the iteration.

    :param neg: iterable of sentence strings
    :return: list of augmented sentence strings, in input order
    """
    return [augmenting_sample(dictionary, sentence.split()) for sentence in tqdm(neg)]

augment_samples = augment(sample_neg[:])

In [None]:
import gc
del model
gc.collect()

In [None]:
'''
def augment_samples(model,x):
    def replacements(model,word, threshold):
        try:
            model[word]
            replacements = model.most_similar(positive=word, topn=1)
            sims=[]
            for k,v in replacements:
                if v>threshold:
                    sims.append(str(k))
                else:
                    continue
            result=[]
            pos = pos_tag([word])[0][1]
            
            for similar_word in sims:
                if pos_tag([similar_word])[0][1] == pos:
                    result.append(similar_word)
            return result
        except:
            pass
            
    
    agg=[]
    for w in x:
        replace = replacements(model,w,0.5)
        if replace is not None and len(replace)>0:
            agg.append(replace[0])
        else:
            agg.append(w)
        
    return ' '.join(agg)
'''

In [None]:
#augment_samples(model,sample_neg[0].split())

In [None]:

'''
def augment():
    insincere = (train_df['target']==1)
    tar = train_df[insincere]['question_text'].values
    augmented=[]
    for sen in tqdm(tar):
        aug = augment_samples(model, sen.split())
        augmented.append(aug)
    
    return augmented

a = augment()
'''

In [None]:
def build_vocab(sentences, verbose =  True):
    """
    Count how often each word occurs across tokenized sentences.

    :param sentences: iterable of sentences, each an iterable of words
    :param verbose: when False the tqdm progress bar is disabled
    :return: dictionary mapping each word to its number of occurrences
    """
    counts = {}
    progress = tqdm(sentences, disable = (not verbose))
    for tokens in progress:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

In [None]:
# Translation table for clean_text, filled so that later updates override
# earlier ones: '/' and '-' end up mapped to a space even though they also
# appear in the removal set.
_CLEAN_TEXT_TABLE = {}
_CLEAN_TEXT_TABLE.update({ord(ch): None for ch in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’'})
_CLEAN_TEXT_TABLE.update({ord(ch): f' {ch} ' for ch in '&'})
_CLEAN_TEXT_TABLE.update({ord(ch): ' ' for ch in "/-"})


def clean_text(x):
    """Normalise punctuation in a piece of text.

    The input is converted with ``str``; '/' and '-' become spaces, '&' is
    padded with a space on each side, and the remaining listed punctuation
    (including the curly quotes) is removed.

    :param x: value to clean (any object accepted by ``str``)
    :return: the cleaned string
    """
    return str(x).translate(_CLEAN_TEXT_TABLE)

In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

# Stemmers used as last-resort lookups by the embedding loaders below.
ps = PorterStemmer()
lc = LancasterStemmer()
sb = SnowballStemmer("english")

# Clean every question. A comprehension works whether `text_list` is still the
# pandas Series built above or already the list produced by an earlier run of
# this cell, so the cell can be re-executed without an AttributeError.
text_list = [clean_text(text) for text in text_list]

In [None]:
#vocab = build_vocab(text_list.apply(lambda x: x.split()).values)

In [None]:
#vocab = sorted(dict(vocab).items(), key=operator.itemgetter(1))[::-1]

#print(vocab[:5])


In [None]:
import operator

def check_converge(vocab,embedding_index):
    """Report how much of `vocab` is covered by `embedding_index`.

    Two percentages are printed: the share of distinct vocabulary words found
    in `embedding_index`, and the share of total word occurrences they account
    for. Both print as 0.00% when the corresponding denominator is zero.

    :param vocab: mapping ``word -> occurrence count``
    :param embedding_index: object indexable by word that raises ``KeyError``
        for words it does not contain
    :return: list of ``(word, count)`` pairs for the uncovered words, highest
        count first
    """
    iv = {}
    oov = {}
    iv_acc = 0
    oov_acc = 0
    for word, count in vocab.items():
        try:
            iv[word] = embedding_index[word]
            iv_acc += count
        except KeyError:
            # Only a missing key means "out of vocabulary"; other errors surface.
            oov[word] = count
            oov_acc += count

    total_words = len(vocab)
    total_count = iv_acc + oov_acc
    # Guard the divisions so an empty vocabulary reports 0% instead of raising.
    print("{:.2%}".format(len(iv) / total_words if total_words else 0.0))
    print("{:.2%}".format(iv_acc / total_count if total_count else 0.0))

    # Ascending stable sort followed by a reversal, as before, so the order of
    # words with equal counts is unchanged.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]
    return sorted_x

In [None]:
#oov=check_converge(vocab,embeddings_index)

Next steps are as follows:
 * Split the training dataset into train and val sample. Cross validation is a time consuming process and so let us do simple train val split.
 * Fill up the missing values in the text column with '_na_'
 * Tokenize the text column and convert them to vector sequences
 * Pad the sequences as needed - if the number of words in the text is greater than 'max_len', truncate it to 'max_len'; if the number of words in the text is less than 'max_len', add zeros for the remaining values.

In [None]:
## some config values 
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use

## fill up the missing values and clean the questions with clean_text, the same
## way as the corpus the tokenizer is fitted on (text_list). Tokenizing the raw
## text instead makes words such as "don't" miss the fitted vocabulary (which
## holds "dont"), so they would be dropped from the sequences.
train_X = [clean_text(question) for question in train_df["question_text"].fillna("_na_").values]
test_X = [clean_text(question) for question in test_df["question_text"].fillna("_na_").values]

## Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(text_list))
train_X = tokenizer.texts_to_sequences(train_X)
test_X = tokenizer.texts_to_sequences(test_X)

## Pad the sentences 
train_X = pad_sequences(train_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

## Get the target values
train_y = train_df['target'].values

debug=False

if debug:
    ## split to train and val: split the prepared arrays themselves so that the
    ## validation rows are actually removed from train_X / train_y
    ## NOTE(review): the augmented samples are built from all target == 1 rows,
    ## so in debug mode they still include rewrites of validation questions.
    train_X, val_X, train_y, val_y = train_test_split(
        train_X, train_y, test_size=0.1, random_state=2018)


**Without Pretrained Embeddings:**

Now that we are done with all the necessary preprocessing steps, we can first train a Bidirectional GRU model. We will not use any pre-trained word embeddings for this model and the embeddings will be learnt from scratch. Please check out the model summary for the details of the layers used. 

In [None]:
'''
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size)(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())
'''

Train the model using train sample and monitor the metric on the valid sample. This is just a sample model running for 2 epochs. Changing the epochs, batch_size and model parameters might give us a better model.

In [None]:
## Train the model 
#model.fit(train_X, train_y, batch_size=512, epochs=2, validation_data=(val_X, val_y))

Now let us get the validation sample predictions and also get the best threshold for F1 score. 

In [None]:
#pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)
#for thresh in np.arange(0.1, 0.501, 0.01):
#    thresh = np.round(thresh, 2)
#    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_noemb_val_y>thresh).astype(int))))

Now let us get the test set predictions as well and save them

In [None]:
#pred_noemb_test_y = model.predict([test_X], batch_size=1024, verbose=1)

Now that our model building is done, it might be a good idea to clean up some memory before we go to the next step.

In [None]:
#del model, inp, x
#import gc; gc.collect()
#time.sleep(10)

In [None]:
def load_glove(word_dict):
    """Build an embedding matrix for `word_dict` from the GloVe 840B 300d file.

    Every word is looked up as-is, then lower-cased, upper-cased, capitalized
    and finally Porter-, Lancaster- and Snowball-stemmed (module-level `ps`,
    `lc`, `sb`); the first form present in the embedding file fills the row.
    Rows with no match keep the fill value -1.

    :param word_dict: mapping ``word -> row index`` (e.g. ``tokenizer.word_index``)
    :return: ``(embedding_matrix, nb_words)`` with ``nb_words = len(word_dict) + 1``
        and a float32 matrix of shape ``(nb_words, vector width)``
    :raises ValueError: if the embedding file yields no vectors
    """
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # `with` closes the file once the index is built (the handle used to leak).
    with open(EMBEDDING_FILE, encoding="utf8") as emb_file:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in emb_file)
    if not embeddings_index:
        raise ValueError("no embedding vectors found in {}".format(EMBEDDING_FILE))

    # The vector width is read from one entry; stacking every vector into a
    # second full-size array only fed unused mean/std values.
    embed_size = len(next(iter(embeddings_index.values())))
    nb_words = len(word_dict) + 1

    embedding_matrix = np.full((nb_words, embed_size), -1., dtype=np.float32)

    # Candidate spellings in priority order; they are evaluated lazily so the
    # stemmers only run when the cheaper forms are missing.
    variants = (lambda w: w, str.lower, str.upper, str.capitalize,
                ps.stem, lc.stem, sb.stem)
    for key, i in tqdm(word_dict.items()):
        for make_variant in variants:
            embedding_vector = embeddings_index.get(make_variant(key))
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                break

    return embedding_matrix, nb_words

In [None]:
def load_fastText(word_dict):
    """Build an embedding matrix for `word_dict` from the wiki-news 300d file.

    Lines of at most 100 characters (such as the header line) are skipped.
    Every word is looked up as-is, then lower-cased, upper-cased, capitalized
    and finally Porter-, Lancaster- and Snowball-stemmed (module-level `ps`,
    `lc`, `sb`); the first form present in the embedding file fills the row.
    Rows with no match keep the fill value -1.

    :param word_dict: mapping ``word -> row index`` (e.g. ``tokenizer.word_index``)
    :return: ``(embedding_matrix, nb_words)`` with ``nb_words = len(word_dict) + 1``
        and a float32 matrix of shape ``(nb_words, vector width)``
    :raises ValueError: if the embedding file yields no vectors
    """
    EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # `with` closes the file once the index is built (the handle used to leak).
    with open(EMBEDDING_FILE, encoding="utf8") as emb_file:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in emb_file if len(o)>100)
    if not embeddings_index:
        raise ValueError("no embedding vectors found in {}".format(EMBEDDING_FILE))

    # The vector width is read from one entry; stacking every vector into a
    # second full-size array only fed unused mean/std values.
    embed_size = len(next(iter(embeddings_index.values())))
    nb_words = len(word_dict) + 1

    embedding_matrix = np.full((nb_words, embed_size), -1., dtype=np.float32)

    # Candidate spellings in priority order; they are evaluated lazily so the
    # stemmers only run when the cheaper forms are missing.
    variants = (lambda w: w, str.lower, str.upper, str.capitalize,
                ps.stem, lc.stem, sb.stem)
    for key, i in tqdm(word_dict.items()):
        for make_variant in variants:
            embedding_vector = embeddings_index.get(make_variant(key))
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                break

    return embedding_matrix, nb_words


In [None]:
def load_paragram(word_dict):
    """Build an embedding matrix for `word_dict` from the paragram_300_sl999 file.

    The file is read as UTF-8 with undecodable bytes ignored, and lines of at
    most 100 characters are skipped. Every word is looked up as-is, then
    lower-cased, upper-cased, capitalized and finally Porter-, Lancaster- and
    Snowball-stemmed (module-level `ps`, `lc`, `sb`); the first form present
    in the embedding file fills the row. Rows with no match keep the fill
    value -1.

    :param word_dict: mapping ``word -> row index`` (e.g. ``tokenizer.word_index``)
    :return: ``(embedding_matrix, nb_words)`` with ``nb_words = len(word_dict) + 1``
        and a float32 matrix of shape ``(nb_words, vector width)``
    :raises ValueError: if the embedding file yields no vectors
    """
    EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # `with` closes the file once the index is built (the handle used to leak).
    with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as emb_file:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in emb_file if len(o)>100)
    if not embeddings_index:
        raise ValueError("no embedding vectors found in {}".format(EMBEDDING_FILE))

    # The vector width is read from one entry; stacking every vector into a
    # second full-size array only fed unused mean/std values.
    embed_size = len(next(iter(embeddings_index.values())))
    nb_words = len(word_dict) + 1

    embedding_matrix = np.full((nb_words, embed_size), -1., dtype=np.float32)

    # Candidate spellings in priority order; they are evaluated lazily so the
    # stemmers only run when the cheaper forms are missing.
    variants = (lambda w: w, str.lower, str.upper, str.capitalize,
                ps.stem, lc.stem, sb.stem)
    for key, i in tqdm(word_dict.items()):
        for make_variant in variants:
            embedding_vector = embeddings_index.get(make_variant(key))
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                break

    return embedding_matrix, nb_words

In [None]:
class Attention(Layer):
    """Attention pooling layer that collapses axis 1 of a 3-D input.

    A weight per position along axis 1 is computed from the input and a learned
    vector ``W`` (plus an optional learned bias ``b``); the output is the
    weighted sum of the input over axis 1.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """Store the configuration; weights are created later in `build`.

        :param step_dim: size of axis 1 of the input, used for reshaping in `call`
        :param W_regularizer: regularizer identifier for ``W`` (resolved via ``regularizers.get``)
        :param b_regularizer: regularizer identifier for ``b``
        :param W_constraint: constraint identifier for ``W`` (resolved via ``constraints.get``)
        :param b_constraint: constraint identifier for ``b``
        :param bias: whether a bias term is created and added to the scores
        :param kwargs: forwarded to the ``Layer`` constructor
        """
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Placeholder; the real value is taken from the input shape in build().
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create ``W`` (length ``input_shape[-1]``) and, if enabled, ``b`` (length ``input_shape[1]``)."""
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None: no mask is passed on to the layers that follow."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1.

        :param x: input tensor; `build` asserts it is 3-D
            (presumably (batch, step_dim, features_dim) - confirm against callers)
        :param mask: optional mask multiplied into the weights before normalisation
        :return: tensor with axis 1 summed out
        """
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten to (-1, features_dim), project onto W, then reshape the
        # resulting scores back to (-1, step_dim).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Zero the weights of masked positions before normalising.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # Normalise over axis 1; epsilon keeps the divisor away from zero.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Add a trailing axis so the weights broadcast against x.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Return ``(input_shape[0], features_dim)``."""
        return input_shape[0],  self.features_dim

In [None]:
# https://www.kaggle.com/hireme/fun-api-keras-f1-metric-cyclical-learning-rate/code

class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or 
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.
    For more detail, please see paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode parameter is ignored 
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.

    # Raises
        ValueError: if scale_fn is None and mode is not one of the
            three built-in policies.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
            else:
                # Fail at construction instead of with an AttributeError on
                # the first learning-rate computation.
                raise ValueError(
                    "mode must be 'triangular', 'triangular2' or 'exp_range' "
                    "when scale_fn is None, got {!r}".format(mode))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        """Return the learning rate for the current value of `clr_iterations`."""
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)
        
    def on_train_begin(self, logs=None):
        """Set the optimizer's learning rate when training starts.

        Uses `base_lr` when no iterations have been counted yet; otherwise the
        schedule continues from the stored iteration count.
        """
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        """Advance the iteration counters, record history and set the next learning rate.

        The first positional argument keeps its original name `epoch` for
        compatibility; this hook is named for the end of a batch.
        """
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        # Record the learning rate that was in effect for the finished batch.
        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
        
        K.set_value(self.model.optimizer.lr, self.clr())
    

In [None]:
clr = CyclicLR(base_lr=0.001, max_lr=0.002,
               step_size=300., mode='exp_range',
               gamma=0.99994)

In [None]:
def build_model(embedding_matrix, nb_words, embedding_size=300):
    """Assemble and compile the stacked BiLSTM -> BiGRU classifier.

    A frozen embedding layer initialised from `embedding_matrix` feeds a
    bidirectional CuDNNLSTM, whose sequence output feeds a bidirectional
    CuDNNGRU. Both recurrent outputs are global-max-pooled, concatenated and
    passed to a single sigmoid unit. The input length is the module-level
    `maxlen`.

    :param embedding_matrix: initial (non-trainable) embedding weights
    :param nb_words: number of rows of the embedding layer
    :param embedding_size: width of each embedding vector
    :return: the compiled Keras model (Adam, binary cross-entropy, `f1` metric)
    """
    tokens_in = Input(shape=(maxlen,))
    embedded = Embedding(nb_words, embedding_size, weights=[embedding_matrix], trainable=False)(tokens_in)
    embedded = SpatialDropout1D(0.3)(embedded)
    lstm_seq = Bidirectional(CuDNNLSTM(256, return_sequences=True))(embedded)
    gru_seq = Bidirectional(CuDNNGRU(128, return_sequences=True))(lstm_seq)
    lstm_pooled = GlobalMaxPooling1D()(lstm_seq)
    gru_pooled = GlobalMaxPooling1D()(gru_seq)
    merged = Concatenate()([lstm_pooled, gru_pooled])
    probability = Dense(1, activation='sigmoid')(merged)
    net = Model(inputs=tokens_in, outputs=probability)
    net.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy', metrics=[f1])
    return net

In [None]:

def f1(y_true, y_pred):
    '''
    Batch-wise F1 score built from Keras backend ops.

    metric from here 
    https://stackoverflow.com/questions/43547402/how-to-calculate-f1-macro-in-keras

    Precision and recall are computed from rounded, [0, 1]-clipped values of
    the tensors passed in, with K.epsilon() added to every denominator.
    '''
    eps = K.epsilon()

    # Counts of true positives, actual positives and predicted positives.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision_value = true_positives / (predicted_positives + eps)
    recall_value = true_positives / (possible_positives + eps)
    return 2*((precision_value*recall_value)/(precision_value+recall_value+eps))

In [None]:
# Look up GloVe and fastText vectors for the same tokenizer vocabulary, then
# join the two matrices column-wise so each row carries both vectors.
embedding_matrix_glove,nb_words_g = load_glove(tokenizer.word_index)
embedding_matrix_fastText, nb_words_f = load_fastText(tokenizer.word_index)
embedding_matrix = np.concatenate((embedding_matrix_glove,embedding_matrix_fastText),axis=1)
# Both loaders return len(word_index) + 1, so either count can be used here.
nb_words = nb_words_g

In [None]:
model = build_model(embedding_matrix,nb_words,600)

In [None]:
model.summary()

In [None]:
# Clean the augmented questions with clean_text before tokenizing: the
# tokenizer was fitted on clean_text-ed text, so uncleaned tokens such as
# "don't" would miss its vocabulary and be dropped from the sequences.
augment_X=tokenizer.texts_to_sequences([clean_text(sample) for sample in augment_samples])

In [None]:
augment_X=pad_sequences(augment_X, maxlen=maxlen)

In [None]:
augmented_X = np.concatenate((train_X,augment_X),axis=0)

In [None]:
augmented_y = np.concatenate((train_y,np.ones(len(augment_X))),axis=0)

In [None]:
model.fit(augmented_X,augmented_y,batch_size=512,callbacks=[clr,],verbose=1,epochs=4)

In [None]:
pred_test_y = 0.5*model.predict([test_X],batch_size=1024,verbose=1)

In [None]:
import gc
del model,embedding_matrix, embedding_matrix_fastText
gc.collect()

In [None]:
embedding_matrix_paragram,nb_words_p = load_paragram(tokenizer.word_index)
embedding_matrix = np.concatenate((embedding_matrix_glove, embedding_matrix_paragram),axis=1)
nb_words= nb_words_g


In [None]:
embedding_matrix_paragram[0]

In [None]:
model = build_model(embedding_matrix,nb_words,600)

In [None]:
model.summary()

In [None]:
model.fit(augmented_X,augmented_y,batch_size=512,callbacks=[clr,],verbose=1,epochs=4)

In [None]:
pred_test_y += 0.5*model.predict([test_X],batch_size=1024,verbose=1)

In [None]:
'''
from sklearn.model_selection import KFold
def create_meta_feature_train(X_train,y_train,bsize=512,epochs_=1,N_fold=2):
    model_list=[]
    out_of_hold_predictions = np.zeros((y_train.shape[0],1))
    kfold = KFold(n_splits=N_fold, shuffle=True, random_state=156)
    for train_index, holdout_index in kfold.split(X_train, y_train):
        base_model = LSTM_with_Attetion(emb_mean,emb_std,embed_size,embedding_matrix)
        base_model.fit(X_train[train_index],y_train[train_index],batch_size=bsize,epochs=epochs_,callback=[clr,])
        model_list.append(base_model)
        out_of_hold_predictions[holdout_index] = base_model.predict(X_train[holdout_index])
    
    return model_list, out_of_hold_predictions
        
def create_meta_feature_test(model_list, X_test):
    meta_features = np.column_stack([model.predict(X_test) for model in model_list]).mean(axis=1)
    return meta_features
    

glove_m_list, glove_meta_feature_tr = create_meta_feature_train(train_X,train_y)
glove_meta_feature_test = create_meta_feature_test(glove_m_list, test_X)
'''

In [None]:
#pred_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)
#for thresh in np.arange(0.1, 0.501, 0.01):
#    thresh = np.round(thresh, 2)
#    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_glove_val_y>thresh).astype(int))))

In [None]:
'''
del word_index, embeddings_index, all_embs, embedding_matrix
for m in glove_m_list:
    del m
del glove_m_list
import gc; gc.collect()
time.sleep(10)
'''

**Wiki News FastText Embeddings:**

Now let us use the FastText embeddings trained on Wiki News corpus in place of Glove embeddings and rebuild the model.

**Paragram Embeddings:**

In this section, we can use the paragram embeddings and build the model and make predictions.

In [None]:
#pred_paragram_val_y = model.predict([val_X], batch_size=1024, verbose=1)
#for thresh in np.arange(0.1, 0.501, 0.01):
#    thresh = np.round(thresh, 2)
#    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_paragram_val_y>thresh).astype(int))))

In [None]:
#pred_paragram_test_y = model.predict([test_X], batch_size=1024, verbose=1)

In [None]:
#meta_features_train = np.concatenate((glove_meta_feature_tr,FastText_meta_feature_tr,Paragram_meta_feature_tr), axis=1)

In [None]:
#meta_features_test = np.concatenate((glove_meta_feature_test,FastText_meta_feature_test,Paragram_meta_feature_test),axis=1)

Meta-learners: NN, SVM, Catboost(or LightGBM)

In [None]:
#from sklearn import svm
#meta_svm = svm.SVC(gamma='scale')
#meta_svm.fit(meta_features_train, y_train) 
#svm_pred_y = meta_svm.predict(meta_features_test)

In [None]:
#from catboost import CatBoostClassifier
#meta_cat = CatBoostClassifier(thread_count=4, n_estimators=400, max_depth=8, eta=0.1, loss_function='Logloss' , verbose=10)
#meta_cat.fit(meta_features_train,y_train)
#cat_pred_y = meta_cat.predict(meta_features_test)

In [None]:
#pred_test_y = 0.5*svm_pred_y + 0.5*cat_pred_y
# pred_test_y holds the sum of the two models' test predictions, each weighted
# 0.5 in the cells above; a question is labelled 1 when that blend exceeds 0.35.
pred_test_y = (pred_test_y>0.35).astype(int)
# One row per test question id, in test_df order, then written as the submission file.
out_df = pd.DataFrame({"qid":test_df["qid"].values})
out_df['prediction'] = pred_test_y
out_df.to_csv("submission.csv", index=False)

In [None]:
#pred_val_y = 0.33*pred_glove_val_y + 0.33*pred_fasttext_val_y + 0.34*pred_paragram_val_y 
#for thresh in np.arange(0.1, 0.501, 0.01):
#    thresh = np.round(thresh, 2)
#    print("F1 score at threshold {0} is {1}".format(thresh, metrics.f1_score(val_y, (pred_val_y>thresh).astype(int))))

The result seems to be better than the individual pre-trained models, so let us create a submission file using this model blend.

In [None]:
#pred_test_y = 0.33*pred_glove_test_y + 0.33*pred_fasttext_test_y + 0.34*pred_paragram_test_y
#pred_test_y = (pred_test_y>0.35).astype(int)
#out_df = pd.DataFrame({"qid":test_df["qid"].values})
#out_df['prediction'] = pred_test_y
#out_df.to_csv("submission.csv", index=False)


**References:**

Thanks to the below kernels which helped me with this one. 
1. https://www.kaggle.com/jhoward/improved-lstm-baseline-glove-dropout
2. https://www.kaggle.com/sbongo/do-pretrained-embeddings-give-you-the-extra-edge