In [1]:
import pandas as pd

In [2]:
# Load the question-pair training data from the zipped CSV into a DataFrame.
# NOTE(review): the path is an absolute Colab-style location; adjust when running elsewhere.
df = pd.read_csv('/content/train.csv.zip')

In [3]:
# Drop every row that has a missing value in any column.
# This rebinds `df`, so the unfiltered frame is no longer reachable after this cell.
df = df.dropna()

In [4]:
# Preview the first five rows to check the columns that were loaded.
df.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [5]:
# List the distinct label values present in the is_duplicate column.
df.is_duplicate.unique()

array([0, 1])

# VISUALIZE WHAT IS_DUPLICATE MEANS

### NOT DUPLICATE

In [6]:
# First question of the pair stored under index label 0 (a pair whose is_duplicate is 0).
df.question1[0]

'What is the step by step guide to invest in share market in india?'

In [7]:
# Second question of the same pair (index label 0), for side-by-side comparison.
df.question2[0]

'What is the step by step guide to invest in share market?'

### DUPLICATE

In [8]:
# Second question of the first pair that is labelled as a duplicate.
df.loc[df['is_duplicate'] == 1, 'question2'].iloc[0]

"I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?"

In [9]:
# First question of the first pair that is labelled as a duplicate.
df.loc[df['is_duplicate'] == 1, 'question1'].iloc[0]

'Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?'

# DATA CLEANSING

### REMOVE NON-ASCII CHARACTERS

In [10]:
import re
def remove_non_ascii(text):
    """Return ``text`` as a string with non-ASCII characters blanked out.

    The input is first converted with ``str()``, so non-string values are
    accepted. Each consecutive run of characters outside the range
    ``\\x00``-``\\x7F`` is replaced by a single space.
    """
    as_string = str(text)
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', as_string)
    return str(ascii_only)
    

In [11]:
# Add cleaned copies of both question columns; the original columns are kept as-is.
df['question1_non_ascii'] = df['question1'].apply(remove_non_ascii)
df['question2_non_ascii'] = df['question2'].apply(remove_non_ascii)


In [12]:
# Persist the frame (original columns plus the cleaned question columns) to disk.
# NOTE(review): the index is written as an extra leading column because index=False is
# not passed; confirm whether downstream readers expect that column.
df.to_csv('/content/preprocessed.csv')
# Show the first ten rows so the new *_non_ascii columns can be inspected.
df.head(10)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_non_ascii,question2_non_ascii
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan..."
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0,Should I buy tiago?,What keeps childern active and far from phone ...
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1,How can I be a good geologist?,What should I do to be a great geologist?
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0,When do you use instead of ?,"When do you use ""&"" instead of ""and""?"
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?


In [13]:
# Pull the targets and the two cleaned question columns out of the frame.
labels = df.is_duplicate.values
corpus1 = df['question1_non_ascii'].tolist()
corpus2 = df['question2_non_ascii'].tolist()
# Every question from both sides, in one list, for fitting the tokenizer vocabulary.
combined = [*corpus1, *corpus2]
df.head()


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_non_ascii,question2_non_ascii
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?


In [14]:
#tokenizer
import tensorflow
import keras.preprocessing.text
from keras.preprocessing.text import Tokenizer
# One tokenizer shared by both question columns, fitted on all questions at once.
# num_words bounds the word indices produced later; words outside that
# vocabulary are mapped to the out-of-vocabulary token.
tokenizer = Tokenizer(
    oov_token="<00V>",
    num_words=10000,
)
tokenizer.fit_on_texts(combined)

In [15]:
# padding
import keras.utils 
# Encode each corpus as integer word ids, then bring every sequence to a fixed
# length of 300 with padding zeros appended at the end.
sequence1 = keras.utils.pad_sequences(
    tokenizer.texts_to_sequences(corpus1),
    maxlen=300,
    padding='post',
)
sequence2 = keras.utils.pad_sequences(
    tokenizer.texts_to_sequences(corpus2),
    maxlen=300,
    padding='post',
)


In [16]:
# Store one padded id array per row next to its source question, for inspection.
df['seq1'] = [padded_row for padded_row in sequence1]
df['seq2'] = [padded_row for padded_row in sequence2]
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_non_ascii,question2_non_ascii,seq1,seq2
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,"[3, 4, 2, 1224, 58, 1224, 2588, 8, 581, 9, 766...","[3, 4, 2, 1224, 58, 1224, 2588, 8, 581, 9, 766..."
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,"[3, 4, 2, 557, 11, 1, 1, 6, 1, 4566, 0, 0, 0, ...","[3, 44, 184, 26, 2, 83, 238, 1, 2, 1, 1, 6, 1,..."
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,"[5, 14, 6, 219, 2, 441, 11, 18, 362, 1832, 202...","[5, 14, 362, 441, 25, 3340, 58, 1349, 222, 1, ..."
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,"[17, 73, 6, 2779, 314, 2762, 5, 14, 6, 652, 20...","[88, 2, 4174, 38, 231, 2226, 1341, 231, 4, 246..."
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,"[24, 50, 7124, 9, 232, 1, 1896, 2050, 1, 13, 1...","[24, 1951, 44, 1245, 9, 2050, 232, 0, 0, 0, 0,..."


In [17]:
from numpy import concatenate
import keras
# Two-branch encoder: each question is embedded and summarised by its own LSTM,
# the two summaries are concatenated and fed to a small dense classifier.
#
# The inputs are integer id sequences that were zero-padded at the end to a
# fixed length. mask_zero=True makes the Embedding layers emit a mask for id 0
# so the LSTMs skip the padding steps; without it, each LSTM's final state is
# computed after running over all trailing padding zeros rather than at the
# last real token. The Keras Tokenizer never assigns id 0 to a word, so no
# vocabulary entry is lost, and the vocabulary size of 10000 still covers
# every id the tokenizer emits with num_words=10000.

# Branch for the first question.
text_input1 = keras.Input(shape=(None,), dtype='int32')
emb1 = keras.layers.Embedding(10000, 64, mask_zero=True)(text_input1)
encoded_text1 = keras.layers.LSTM(32)(emb1)

# Branch for the second question (separate weights from the first branch).
text_input2 = keras.Input(shape=(None,), dtype='int32')
emb2 = keras.layers.Embedding(10000, 64, mask_zero=True)(text_input2)
encoded_text2 = keras.layers.LSTM(32)(emb2)

# Join the two 32-dimensional question encodings along the feature axis.
concat = keras.layers.concatenate([encoded_text1, encoded_text2], axis=-1)

# Classifier head: one hidden layer, then a sigmoid unit giving the
# probability for the binary is_duplicate label.
output = keras.layers.Dense(64, activation='relu')(concat)
output = keras.layers.Dense(1, activation='sigmoid')(output)


In [18]:
# Wrap the two inputs and the sigmoid output into a trainable model and
# configure it for binary classification.
model = keras.Model(inputs=[text_input1, text_input2], outputs=output)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [74]:
# Print the layer-by-layer architecture and parameter counts of the compiled model.
model.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_22 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 input_23 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 embedding_18 (Embedding)       (None, None, 64)     640000      ['input_22[0][0]']               
                                                                                                  
 embedding_19 (Embedding)       (None, None, 64)     640000      ['input_23[0][0]']               
                                                                                            

In [19]:
# Train on the padded question pairs while holding out part of the data.
# validation_split=0.2 reserves the last 20% of the samples (taken before any
# shuffling) as a validation set, so each epoch also reports val_loss and
# val_accuracy. Previously the model was fitted on every row, with no held-out
# data to detect overfitting.
history = model.fit(
    [sequence1, sequence2],
    labels,
    epochs=10,
    batch_size=128,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Reference: https://www.kaggle.com/code/ratul6/quora-question-pairs-analysis-using-lstm