In [3]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


In [4]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c 'quora-question-pairs'
!unzip '/content/train.csv.zip'  
!unzip '/content/test.csv.zip'  

Downloading test.csv.zip to /content
 92% 105M/114M [00:02<00:00, 48.8MB/s] 
100% 114M/114M [00:02<00:00, 56.4MB/s]
Downloading train.csv.zip to /content
 43% 9.00M/21.2M [00:00<00:00, 19.1MB/s]
100% 21.2M/21.2M [00:00<00:00, 39.2MB/s]
test.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
Downloading sample_submission.csv.zip to /content
100% 4.95M/4.95M [00:00<00:00, 39.6MB/s]

Archive:  /content/train.csv.zip
  inflating: train.csv               
Archive:  /content/test.csv.zip
  inflating: test.csv                


In [6]:
!wget -P /content/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2020-05-19 12:09:29--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.18.211
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.18.211|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘/content/GoogleNews-vectors-negative300.bin.gz’


2020-05-19 12:10:15 (34.6 MB/s) - ‘/content/GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



## Importing Libraries


In [5]:
import nltk
import tqdm
import pandas as pd
import numpy as np
import tensorflow as tf
import re, nltk, gensim
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint,TensorBoard
from tensorflow.python.keras.models import Sequential
from tensorflow.keras.layers import Embedding,GRU,Dense,Input,\
BatchNormalization,Bidirectional,concatenate,Dropout,Conv1D,\
MaxPooling1D,Flatten,add,Lambda
import tensorflow.keras.backend as K

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Loading Data

In [0]:
def load_data(dataset):
  train=pd.read_csv(dataset)
  train.dropna(axis=0, inplace=True)
  return train

data=load_data('train.csv')
data=data[:10000]

#Creating two list one for left and another for the right question
def list_data(train):
  q1 = pd.Series(train.question1.tolist()).astype(str)
  q2 = pd.Series(train.question2.tolist()).astype(str)
  return q1,q2

q1,q2=list_data(data)

## Counts for positive and negative examples

In [9]:
#Checking for the output counts (Check for data imbalance)
data['is_duplicate'].value_counts()

0    6289
1    3711
Name: is_duplicate, dtype: int64

## Preparing the text data

### Data cleaning

In [10]:
def text_clean(corpus):
    cleaned_corpus = pd.Series()
    for row in corpus:
        qs_list = []
        for word in row.split():
            word = word.lower()
            word = re.sub(r"[^a-zA-Z0-9^.']"," ",word)
            p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)
            qs_list.append(p1)
        cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs_list)))
    return cleaned_corpus

all_corpus = q1.append(q2)
all_corpus = text_clean(all_corpus)

  


In [0]:
#The data is in format like all q1 are the in the starting 
#rows of all_corpus
#then once q1 gets finished, q2 starts. So again 
#separating q1 and q2 and merging them into a data frame.
def clean_data(all_corpus,q1,q2,train):
  q1 = all_corpus[0:q1.shape[0]]
  q2 = all_corpus[q2.shape[0]::]
  data_out = pd.DataFrame({'q1': q1, 'q2': q2})
  data_out.index=list(range(0,len(data_out)))
  data_out['output']=train['is_duplicate']
  return data_out
data_new=clean_data(all_corpus,q1,q2,data)


### Creating word to index

In [0]:
#creating word to index using keras tokenizer
def word_to_index(all_corpus):
  lines = []
  for key in all_corpus:
    lines.append(key)
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  return(tokenizer.word_index)
word2index=word_to_index(all_corpus)
index2word = dict((v,k) for k,v in word2index.items())


### Implementing word2vec embedding on text data


In [14]:
# Loading pre-trained word vectors
def load_embedding(EMBEDDING_FILE,embedding_dim):
  word2vec_model = gensim.models.KeyedVectors.\
  load_word2vec_format(EMBEDDING_FILE, binary = True)
  w2v = dict(zip(word2vec_model.wv.index2word,\
                 word2vec_model.wv.syn0))
  
# This will be the embedding matrix
  embeddings = 1 * np.random.randn(len(word2index) \
                                   + 1, embedding_dim)  
  embeddings[0] = 0  # So that the padding will be ignored


# Build the embedding matrix
  for word, index in word2index.items():
      if word in word2vec_model.vocab:
          embeddings[index] = word2vec_model.word_vec(word)
  return embeddings

embedding_dim=300
EMBEDDING_FILE = '/content/GoogleNews-vectors-negative300.bin.gz'

embeddings=load_embedding(EMBEDDING_FILE,embedding_dim)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


### Max length 


In [0]:
def max_length(all_corpus):
  lines=[]
  max_len=-1
  for key in all_corpus:
    for d in key:
      if len(d.split())>max_len:
        max_len=len(d.split())
  return max_len

max_len=max_length(all_corpus)

## Creating training data 


In [0]:
#If len is not equal to max_len then doing post padding
max_len=30
def create_train_data(dataset,max_length,column):
  X1=list()
  for idx in range(len(dataset)):
    for words in (data_new.iloc[idx][[column]].values):
      numeric_seq = [word2index[word] for word \
                     in words.split() if word in word2index]
      in_seq=numeric_seq
      in_seq=pad_sequences([in_seq],maxlen=max_length,\
                           padding='post')[0]
      X1.append(in_seq)
  return X1

q1=np.array(create_train_data(data_new,max_len,'q1'))
q2=np.array(create_train_data(data_new,max_len,'q2'))

## Train Test Split


In [0]:
def split_train_test(q1,q2,data):
  X = np.stack((q1, q2), axis=1)
  X_train, X_test, y_train, y_test = X[:-10], \
  X[-10:],list(data['is_duplicate'])[:-10],list(data['is_duplicate'])[-10:]
  train_q1 = X_train[:,0]
  train_q2 = X_train[:,1]
  test_q1 = X_test[:,0]
  test_q2 = X_test[:,1]
  return train_q1,train_q2,test_q1,test_q2,\
  y_train,y_test,X_train,X_test
train_q1,train_q2,test_q1,test_q2,y_train,\
y_test,X_train,X_test=split_train_test(q1,q2,data)
y_train=np.array(y_train)
y_test=np.array(y_test)

## Euclidean distance and Cosine distance

In [0]:
#Cosine distance
def cosine_distance(output):
  x, y= output[0],output[1]
  x = K.l2_normalize(x, axis=-1)
  y = K.l2_normalize(y, axis=-1)
  return -K.mean(x * y, axis=-1, keepdims=True)

def euclidean_distance(output):
    x, y = output[0],output[1]
    return K.sqrt(K.sum(K.square(x - y), axis=1, keepdims=True))

## Contrastive Loss

In [0]:
def contrastive_loss(y_true, y_pred):
    margin = 1
    return (y_true * K.square(y_pred)\
            + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))

def accuracy(y_true, y_pred):
    return K.mean(tf.cast(tf.equal(y_true, tf.cast(y_pred < 0.5, dtype=tf.float32)), dtype=tf.float32))

## First Model Stacked Birectional GRU with cosine distance


In [0]:
def gru_model(input_shape,embeddings,embedding_dim):
  model_input = Input(shape=(input_shape,))
  layer = Embedding(len(embeddings), 
                 embedding_dim, 
                 weights=[embeddings], 
                 input_length=max_len, 
                 trainable=True)(model_input)
  layer = Bidirectional(GRU(200, return_sequences=True))(layer)
  layer = tf.keras.layers.BatchNormalization()(layer)
  layer = Bidirectional(GRU(200,return_sequences=True,dropout=0.2,
                            recurrent_dropout=0.2))(layer)
  layer = tf.keras.layers.BatchNormalization()(layer)
  output = Bidirectional(GRU(200,return_sequences=False,dropout=0.2,
                            recurrent_dropout=0.2))(layer)
  model = Model(inputs=model_input, outputs=output)
  return model

model = gru_model(max_len,embeddings,embedding_dim)

input_q1 = Input(shape=(max_len,))
input_q2 = Input(shape=(max_len,))

left_out = model(input_q1)
right_out = model(input_q2)

output = Lambda(euclidean_distance, name='euclidean_distance')\
([left_out, right_out])

gru_model = Model(inputs=[input_q1,input_q2], outputs=output)
gru_model.summary()

gru_model.compile(loss=contrastive_loss, optimizer='adam',\
              metrics=[accuracy])
callback = [ModelCheckpoint('question_pairs_weights_gru.h5',\
                            monitor='accuracy', save_best_only=True,mode='max'),
            TensorBoard(log_dir='/content/logs', write_graph=True)
            ]

history = gru_model.fit([train_q1,train_q2],
                    y_train,
                    epochs=10,
                    batch_size=10,
                    callbacks=callback,
                    validation_split=0.05)

## Result

#### Model Prediction

In [45]:
y_pred=gru_model.predict([test_q1,test_q2])
data_new_test=data_new[-10:]
data_new_test['Y_prediction']=[i for i in y_pred]
data_new_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,q1,q2,output,Y_prediction
9990,why does 500 and 1000 rs notes banned by goi a...,what do you think of the decision by the india...,1,[0.38043028]
9991,how do i stop a css layout from distorting whe...,what are the different types of css layouts,0,[0.77639353]
9992,is a world war going to happen,can world war 3 ever take place,1,[0.3749664]
9993,early elizabethan dramalists,what do muslims think of pig slaughter,0,[0.58103967]
9994,what was it like flying first class in the 1990s,what is it like flying first class,0,[0.26551232]
9995,how would you order these four cities bangalo...,what is the cost of living in europe and the u...,0,[1.2808392]
9996,stphen william hawking,what are the differences between sm yg and jy...,0,[1.341012]
9997,mathematical puzzles what is 3...,what are the steps to solve this equation ma...,0,[1.3909037]
9998,is ims noida good for bca,how good is ims noida for studying bca,1,[0.3366593]
9999,what are the most respected and informative te...,what are caltech s required and recommended te...,0,[1.2772219]


## Second model CNN Siamese Network

In [24]:
def cnn_model(input_shape,embeddings,embedding_dim):
  model_input = Input(shape=(input_shape,))
  layer = Embedding(len(embeddings), 
                 embedding_dim, 
                 weights=[embeddings], 
                 input_length=max_len, 
                 trainable=False)(model_input)
  layer = Conv1D(filters=64,kernel_size=3,activation='relu')(layer)
  layer = MaxPooling1D(pool_size=2)(layer)
  layer = Dropout(0.2)(layer)
  layer = Conv1D(filters=64,kernel_size=2,activation='relu')(layer)
  layer = MaxPooling1D(pool_size=2)(layer)
  layer = Dropout(0.2)(layer)
  layer = Conv1D(filters=64,kernel_size=2,activation='relu')(layer)
  layer = MaxPooling1D(pool_size=2)(layer)
  output = Flatten()(layer)
  
  model = Model(inputs=model_input, outputs=output)
  model.summary()
  return model

cnn_model = cnn_model(max_len,embeddings,embedding_dim)

input_q1 = Input(shape=(max_len,))
input_q2 = Input(shape=(max_len,))

left_out = cnn_model(input_q1)
right_out = cnn_model(input_q2)

output = Lambda(euclidean_distance, name='euclidean_distance')\
([left_out, right_out])

cnn_model = Model(inputs=[input_q1,input_q2], outputs=output)
cnn_model.summary()

cnn_model.compile(loss=contrastive_loss, optimizer='adam',\
              metrics=[accuracy])

callback = [ModelCheckpoint('question_pairs_weights_cnn.h5',\
                            monitor='accuracy', save_best_only=True,mode='max')]

history = cnn_model.fit([train_q1,train_q2],
                    y_train,
                    epochs=10,
                    batch_size=10,
                    callbacks=callback)

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, 30)]              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 30, 300)           4505400   
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 28, 64)            57664     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 14, 64)            0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 14, 64)            0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 13, 64)            8256      
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 6, 64)             0   

### Result

In [26]:
y_pred=cnn_model.predict([test_q1,test_q2])
data_new_test=data_new[-10:]
data_new_test['Y_prediction']=[i for i in y_pred]
data_new_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,q1,q2,output,Y_prediction
9990,why does 500 and 1000 rs notes banned by goi a...,what do you think of the decision by the india...,1,[0.0]
9991,how do i stop a css layout from distorting whe...,what are the different types of css layouts,0,[0.39628065]
9992,is a world war going to happen,can world war 3 ever take place,1,[0.058691166]
9993,early elizabethan dramalists,what do muslims think of pig slaughter,0,[0.33805332]
9994,what was it like flying first class in the 1990s,what is it like flying first class,0,[0.77967125]
9995,how would you order these four cities bangalo...,what is the cost of living in europe and the u...,0,[0.20272695]
9996,stphen william hawking,what are the differences between sm yg and jy...,0,[1.8747106]
9997,mathematical puzzles what is 3...,what are the steps to solve this equation ma...,0,[1.4450378]
9998,is ims noida good for bca,how good is ims noida for studying bca,1,[0.29204148]
9999,what are the most respected and informative te...,what are caltech s required and recommended te...,0,[1.0701656]
