In [1]:
import pandas as pd
import numpy as np
import re
from keras.models import Model
from keras.layers.core import Dropout, Dense, Flatten, Activation
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import Bidirectional
from keras.layers.recurrent import GRU
from keras.layers.pooling import MaxPooling2D
from keras.layers import Merge,Input,concatenate


Using TensorFlow backend.


In [2]:
df = pd.read_csv('quora_duplicate_questions.tsv',sep='\t',encoding='utf8')

In [3]:
df = df.dropna()

In [4]:
import spacy
#from gensim.utils import lemmatize
nlp = spacy.load('en')

In [6]:
def clean_data(string):
    #try: 
    rx = re.compile('\W+')
    res = rx.sub(' ',string).strip()
    res = res.replace('\'','')
    #return ' '.join([str(x) for x in nlp(res)])
    #return ' '.join([x.lemma_ for x in nlp(res)])

    return ' '.join([x.lemma_ for x in nlp(res) if not x.is_stop ])

    #except TypeError:
        #print('TypeError',string)

In [7]:
df['q1_clean'] = df['question1'].apply(lambda x: clean_data(x))
df['q2_clean'] = df['question2'].apply(lambda x: clean_data(x))

In [8]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_clean,q2_clean
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,step step guide invest share market india,step step guide invest share market
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,story kohinoor koh noor diamond,happen indian government steal kohinoor koh no...
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,increase speed internet connection vpn,internet speed increase hack dns
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,mentally lonely solve,find remainder math 23 24 math divide 24 23
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,dissolve water quikly sugar salt methane carbo...,fish survive salt water


In [9]:
print('q1_counts \n',df['q1_clean'].apply(lambda x: len(x.split())).describe())
print('q2_counts \n',df['q2_clean'].apply(lambda x: len(x.split())).describe())

q1_counts 
 count    404288.000000
mean          5.222468
std           2.913420
min           0.000000
25%           3.000000
50%           5.000000
75%           6.000000
max          68.000000
Name: q1_clean, dtype: float64
q2_counts 
 count    404288.000000
mean          5.278838
std           3.182402
min           0.000000
25%           3.000000
50%           4.000000
75%           6.000000
max          92.000000
Name: q2_clean, dtype: float64


In [10]:
def build_vocab(string):
    for x in string.split():
        vocab.add(x)
        lexicon.append(x)
    return

In [11]:
vocab = set()
lexicon = []

In [12]:
_ = df['q1_clean'].apply(lambda x: build_vocab(x))
_ = df['q2_clean'].apply(lambda x: build_vocab(x))

In [13]:
vocab_size = len(vocab)

In [14]:
vocab_size

77092

In [15]:
def generate_labels(inputs, cnter,max_len,vocab_len):
    np_input = []
    
    for sentences in inputs:
        sen  = np.full(max_len,(vocab_len))
        for idx,word in enumerate(sentences.split()):
            if(idx<max_len):
                sen[idx] = cnter.index(word)
        np_input.append(sen)
        
    return np.array(np_input).astype(np.int)
        
        

In [16]:
from collections import Counter
vocab_cnter = Counter(lexicon)

In [17]:
sorted_vocab = [x[0] for x in vocab_cnter.most_common()]

In [18]:
sorted_vocab[:10]

['good', 's', 'india', 'like', 'people', 'way', 't', 'quora', 'learn', 'life']

In [19]:
word2vec = []
for word in sorted_vocab:
    word2vec.append(nlp(word).vector)

In [20]:
len(word2vec)

77092

In [21]:
word2vec.append(np.zeros(300))
word2vec = np.array(word2vec)
word2vec.shape

(77093, 300)

In [22]:
q1_inputs = generate_labels(df['q1_clean'],sorted_vocab,30,len(vocab))
print('q1 processed')
q2_inputs = generate_labels(df['q2_clean'],sorted_vocab,30,len(vocab))

q1 processed


In [40]:
"""
A keras attention layer that wraps RNN layers.
Based on tensorflows [attention_decoder](https://github.com/tensorflow/tensorflow/blob/c8a45a8e236776bed1d14fd71f3b6755bd63cc58/tensorflow/python/ops/seq2seq.py#L506) 
and [Grammar as a Foreign Language](https://arxiv.org/abs/1412.7449).
date: 20161101
author: wassname
url: https://gist.github.com/wassname/5292f95000e409e239b9dc973295327a
"""

from keras import backend as K
from keras.engine import InputSpec
from keras.layers import LSTM, activations, Wrapper, Recurrent

class Attention(Wrapper):
    """
    This wrapper will provide an attention layer to a recurrent layer. 
    
    # Arguments:
        layer: `Recurrent` instance with consume_less='gpu' or 'mem'
    
    # Examples:
    
    ```python
    model = Sequential()
    model.add(LSTM(10, return_sequences=True), batch_input_shape=(4, 5, 10))
    model.add(TFAttentionRNNWrapper(LSTM(10, return_sequences=True, consume_less='gpu')))
    model.add(Dense(5))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop') 
    ```
    
    # References
    - [Grammar as a Foreign Language](https://arxiv.org/abs/1412.7449)
    
    
    """
    def __init__(self, layer, **kwargs):
        assert isinstance(layer, Recurrent)
        #if layer.get_config()['consume_less']=='cpu':
            #raise Exception("AttentionLSTMWrapper doesn't support RNN's with consume_less='cpu'")
        self.supports_masking = True
        super(Attention, self).__init__(layer, **kwargs)

    def build(self, input_shape):
        assert len(input_shape) >= 3
        self.input_spec = [InputSpec(shape=input_shape)]
        nb_samples, nb_time, input_dim = input_shape

        if not self.layer.built:
            self.layer.build(input_shape)
            self.layer.built = True

        super(Attention, self).build()
        
        self.W1 = self.layer.init((input_dim, input_dim, 1, 1), name='{}_W1'.format(self.name))
        self.W2 = self.layer.init((self.layer.output_dim, input_dim), name='{}_W2'.format(self.name))
        self.b2 = K.zeros((input_dim,), name='{}_b2'.format(self.name))
        self.W3 = self.layer.init((input_dim*2, input_dim), name='{}_W3'.format(self.name))
        self.b3 = K.zeros((input_dim,), name='{}_b3'.format(self.name))
        self.V = self.layer.init((input_dim,), name='{}_V'.format(self.name))

        self.trainable_weights = [self.W1, self.W2, self.W3, self.V, self.b2, self.b3]

    def get_output_shape_for(self, input_shape):
        return self.layer.get_output_shape_for(input_shape)

    def step(self, x, states):
        # This is based on [tensorflows implementation](https://github.com/tensorflow/tensorflow/blob/c8a45a8e236776bed1d14fd71f3b6755bd63cc58/tensorflow/python/ops/seq2seq.py#L506).
        # First, we calculate new attention masks:
        #   attn = softmax(V^T * tanh(W2 * X +b2 + W1 * h))
        # and we make the input as a concatenation of the input and weighted inputs which is then
        # transformed back to the shape x of using W3
        #   x = W3*(x+X*attn)+b3
        # Then, we run the cell on a combination of the input and previous attention masks:
        #   h, state = cell(x, h).
        
        nb_samples, nb_time, input_dim = self.input_spec[0].shape
        h = states[0]
        X = states[-1]
        xW1 = states[-2]
        
        Xr = K.reshape(X,(-1,nb_time,1,input_dim))
        hW2 = K.dot(h,self.W2)+self.b2
        hW2 = K.reshape(hW2,(-1,1,1,input_dim)) 
        u = K.tanh(xW1+hW2)
        a = K.sum(self.V*u,[2,3])
        a = K.softmax(a)
        a = K.reshape(a,(-1, nb_time, 1, 1))
        
        # Weight attention vector by attention
        Xa = K.sum(a*Xr,[1,2])
        Xa = K.reshape(Xa,(-1,input_dim))
        
        # Merge input and attention weighted inputs into one vector of the right size.
        x = K.dot(K.concatenate([x,Xa],1),self.W3)+self.b3    
        
        h, new_states = self.layer.step(x, states)
        return h, new_states

    def get_constants(self, x):
        constants = self.layer.get_constants(x)
        
        # Calculate K.dot(x, W2) only once per sequence by making it a constant
        nb_samples, nb_time, input_dim = self.input_spec[0].shape
        Xr = K.reshape(x,(-1,nb_time,input_dim,1))
        Xrt = K.permute_dimensions(Xr, (0, 2, 1, 3))
        xW1t = K.conv2d(Xrt,self.W1,border_mode='same')     
        xW1 = K.permute_dimensions(xW1t, (0, 2, 3, 1))
        constants.append(xW1)
        
        # we need to supply the full sequence of inputs to step (as the attention_vector)
        constants.append(x)
        
        return constants

    def call(self, x, mask=None):
        # input shape: (nb_samples, time (padded with zeros), input_dim)
        input_shape = self.input_spec[0].shape
        if K._BACKEND == 'tensorflow':
            if not input_shape[1]:
                raise Exception('When using TensorFlow, you should define '
                                'explicitly the number of timesteps of '
                                'your sequences.\n'
                                'If your first layer is an Embedding, '
                                'make sure to pass it an "input_length" '
                                'argument. Otherwise, make sure '
                                'the first layer has '
                                'an "input_shape" or "batch_input_shape" '
                                'argument, including the time axis. '
                                'Found input shape at layer ' + self.name +
                                ': ' + str(input_shape))

        if self.layer.stateful:
            initial_states = self.layer.states
        else:
            initial_states = self.layer.get_initial_states(x)
        constants = self.get_constants(x)
        preprocessed_input = self.layer.preprocess_input(x)
        

        last_output, outputs, states = K.rnn(self.step, preprocessed_input,
                                             initial_states,
                                             go_backwards=self.layer.go_backwards,
                                             mask=mask,
                                             constants=constants,
                                             unroll=self.layer.unroll,
                                             input_length=input_shape[1])
        if self.layer.stateful:
            self.updates = []
            for i in range(len(states)):
                self.updates.append((self.layer.states[i], states[i]))

        if self.layer.return_sequences:
            return outputs
        else:
            return last_output
            

In [23]:
from keras.layers import Layer
class AttentionWithContext(Layer):
    """
    Attention operation, with a context/query vector, for temporal data.
    Supports Masking.
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    :param kwargs:
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(AttentionWithContext())
    """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):

        self.supports_masking = True
        self.init = initializations.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(AttentionWithContext, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight((input_shape[-1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        uit = K.dot(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        mul_a = uit  * self.u # with this
        ait = K.sum(mul_a, axis=2) # and this

        a = K.exp(ait)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def get_output_shape_for(self, input_shape):
        return input_shape[0], input_shape[-1]
    
    def compute_output_shape(self, input_shape):
        """Shape transformation logic so Keras can infer output shape
        """
        return (input_shape[0], input_shape[-1])


In [24]:
from keras.models import Model
from keras.layers.core import  Lambda,Dropout,Dense, Flatten, Activation
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import Bidirectional
from keras.layers.convolutional import Conv1D
from keras.layers.recurrent import GRU,LSTM
from keras.layers.pooling import MaxPooling2D
from keras.layers import Concatenate, Input, concatenate 
from keras.layers.normalization import BatchNormalization
from keras import initializers as initializations
from keras import regularizers
from keras import constraints
from keras import backend as K
def cosine_distance(vests):
    x, y = vests
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)
    return -K.mean(x * y, axis=-1, keepdims=True)
def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.sum(K.square(x - y), axis=1, keepdims=True))

def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1
    return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))

def cos_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0],1)

# Q1
input_1 = Input(shape=(30,))
embedd = Embedding(len(sorted_vocab)+1,300,
                        weights =[word2vec],
                        name="Embedding_1")
embedding_1 = (embedd)(input_1)

bi_gru_1 = Bidirectional(LSTM(200,name='LSTM_1',dropout=.8))(embedding_1)
#att_1 = AttentionWithContext()(bi_gru_1)
#bi_gru_1_2 =  Bidirectional(LSTM(100,name="LSTM_1_2",dropout=.8,recurrent_dropout=.8,return_sequences=True))(bi_gru_1)
#bi_gru_1_3 =  Bidirectional(LSTM(50,name="LSTM_1_2",dropout=.8,recurrent_dropout=.8))(bi_gru_1_2)
#Q2 
input_2 = Input(shape=(30,))
embedding_2 = (embedd)(input_2)
bi_gru_2 = Bidirectional(LSTM(200,name="LSTM_2",dropout=.8))(embedding_2)
#att_2 = AttentionWithContext()(bi_gru_2)
#bi_gru_2_2 =  Bidirectional(LSTM(100,name="LSTM_2_2",recurrent_dropout=.8,return_sequences=True))(bi_gru_2)
#bi_gru_2_3 = Bidirectional(LSTM(50,name='LSTM_2_3'))(bi_gru_2_2)

#Merge
#cosine distance
#merged_1 = Lambda(cosine_distance,output_shape=cos_dist_output_shape)([att_1,att_2])
#euclid distance
#merged_1 = Flatten()(merged_1)
#print(merged_1)
#merged_2 = Lambda(euclidean_distance,output_shape=eucl_dist_output_shape)([att_1,att_2])
#merged_2 = Flatten()(merged_2)
merged = concatenate([bi_gru_1,bi_gru_2])
#merged = (Flatten())(merged)
#Dense
dense = Dense(4098,activation='relu',name='dense')(merged)
drop_1 = Dropout(.5,name='drop_1')(dense)
drop_1 = BatchNormalization()(drop_1)
dense_2 = Dense(1000,activation='relu',name='dense_2')(drop_1)
drop_2 = Dropout(.5,name='drop_2')(dense_2)
drop_2 = BatchNormalization()(drop_2)
pred = Dense(2,activation="softmax",name='pred')(drop_2)
model = Model(inputs=[input_1,input_2],outputs=pred)
# model = Sequential()
# model.add(Embedding(len(vocab),64,input_length=sentence_len,name='Embedding_Input'))
# model.add(Bidirectional(GRU(100,dropout=.5,name='GRU')))
# model.add(Dense(1000,activation='relu',))

In [25]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 30)            0                                            
____________________________________________________________________________________________________
input_2 (InputLayer)             (None, 30)            0                                            
____________________________________________________________________________________________________
Embedding_1 (Embedding)          (None, 30, 300)       23127900                                     
____________________________________________________________________________________________________
bidirectional_1 (Bidirectional)  (None, 400)           801600                                       
___________________________________________________________________________________________

In [26]:
q1_inputs.shape

(404288, 30)

In [None]:
q2_inputs.shape

(404288, 30)

In [61]:
def f1_score(tags, predicted):
    #print(tags)
    #print(K.softmax(predicted))
    tags = set(tags)
    #print(tags)
    predicted = set(K.(predicted))

    tp = len(tags & predicted)
    fp = len(predicted) - tp 
    fn = len(tags) - tp

    if tp>0:
        precision=float(tp)/(tp+fp)
        recall=float(tp)/(tp+fn)
        return 2*((precision*recall)/(precision+recall))
    else:
        return 0

In [63]:

def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1
    return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))


In [None]:
from keras import optimizers
import keras
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ModelCheckpoint
early_stopping =EarlyStopping(monitor='val_loss', patience=3)
adam = optimizers.Adam(lr=.0001)
tbCallBack = keras.callbacks.TensorBoard(log_dir='./Graph_2_lstm_with_f1', histogram_freq=0, write_graph=True, write_images=True)
x_train_1,x_test_1,x_train_2,x_test_2,y_train,y_test = train_test_split(q1_inputs,q2_inputs,pd.get_dummies(df['is_duplicate']).values,test_size=.1,random_state=4)
model.compile(loss='binary_crossentropy',optimizer=adam,metrics=['accuracy'])
model.fit(x=[x_train_1,x_train_2],y=y_train,
          batch_size=256,epochs=100,validation_data=[[x_test_1,x_test_2],y_test],callbacks=[tbCallBack,ea])


Train on 363859 samples, validate on 40429 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100

In [89]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
def train_loop(model, epochs,feats,labels):
    #permutation = np.array.permutation(len)
    #print(feats)
    x_train_1,x_train_2,x_test_1,x_test_2,y_train,y_test = train_test_split(feats[0],feats[1],labels,test_size=.1,random_state=4)
    for i in range(epochs):
        model.fit(x=x_train,y=y_train, batch_size=64,ephocs=1)
        print('F1:',metrics.f1_score(y_test,(model.predict(x_test))))

    model.save('LSTM_3.p')
    