In [10]:
#!pip install transformers
#!pip install datasets

In [11]:
# Dataframe and computation
import numpy as np
import pandas as pd

# Deep learning libraries
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.layers import BatchNormalization
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
#NLTK and regex libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import string

#Sklearn libraries
from sklearn.model_selection import train_test_split

# Lemmatizer instance used by clean_string() for noun lemmatization.
wn = nltk.WordNetLemmatizer()
# Downloads for string cleaning: the stopword list and the WordNet corpus.
# The final call is the cell's last expression, so its return value is displayed.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Import and inspect data

In [12]:
# Load the training CSV into a DataFrame (hard-coded path under /content).
df = pd.read_csv('/content/train.csv')

In [13]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


## Function to clean strings

In [14]:
# Cleaning function for the strings
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, reading the corpus only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_string(input_str):
    """Normalise a piece of text before tokenization.

    Steps, in order: lowercase; strip URLs, ``www.`` links and e-mail
    addresses; drop ASCII punctuation; remove English stopwords; lemmatize
    every remaining word as a noun.

    Args:
        input_str: The raw text to clean. Must be a ``str``.

    Returns:
        The cleaned words joined by single spaces, or an empty string when no
        word survives cleaning.
    """
    # Lowercase the input string
    input_str = input_str.lower()

    # Remove URLs, links and e-mail addresses. The dot after "www" is escaped
    # so that only real "www." prefixes match, not any word starting with
    # "www" followed by an arbitrary character.
    input_str = re.sub(r"http\S+", "", input_str)
    input_str = re.sub(r"www\.\S+", "", input_str)
    input_str = re.sub(r"\S+@\S+", "", input_str)

    # Remove punctuation
    input_str_punc = "".join(char for char in input_str if char not in string.punctuation)

    # Remove stopwords. Empty tokens (produced by re.split when the text starts
    # or ends with a separator) are dropped so they do not become stray spaces.
    # NOTE(review): punctuation is stripped before this step, so contractions
    # such as "don't" arrive here as "dont" and are not matched by the list.
    stopword = _english_stopwords()
    words = [word for word in re.split(r'\W+', input_str_punc) if word and word not in stopword]

    # Lemmatization
    return " ".join(wn.lemmatize(word, 'n') for word in words)

### Apply cleaning function to data

In [15]:
# Clean both question columns. Missing questions (NaN) are replaced with an
# empty string first; str(NaN) would otherwise inject the literal word "nan"
# into the text and, later, into the tokenizer vocabulary.
for question_col in ("question1", "question2"):
    df[question_col] = df[question_col].fillna("").astype(str).apply(clean_string)

### Split the testing and training data

In [16]:
# Hold out 30% of the rows for evaluation. The fixed random_state makes the
# split, and every result derived from it, reproducible across kernel restarts.
train, test = train_test_split(df, test_size=0.3, random_state=42)

In [17]:
# Pull the two question columns and the label column out of the training frame
# as NumPy arrays.
sent_1_train, sent_2_train, Y_train = (
    train[column].values for column in ("question1", "question2", "is_duplicate")
)

In [18]:
# Pull the two question columns and the label column out of the held-out frame
# as NumPy arrays.
sent_1_test, sent_2_test, Y_test = (
    test[column].values for column in ("question1", "question2", "is_duplicate")
)

### Tokenizing and padding training/testing data

In [19]:
# Fit the tokenizer on the training questions only (both columns together),
# with num_words set to 200000.
tokenizer = Tokenizer(num_words=200000)
tokenizer.fit_on_texts([*sent_1_train, *sent_2_train])

In [20]:

# Convert the training questions to integer sequences, then pad/truncate them
# to 30 tokens (padding added at the end). The text is read from the `train`
# frame rather than from `sent_1_train` / `sent_2_train`, which this cell
# overwrites: re-running the cell therefore never feeds already-tokenized
# sequences back into the tokenizer.
sent_1_train = tokenizer.texts_to_sequences(train["question1"].values)
sent_1_train_pad = pad_sequences(sent_1_train, maxlen=30, padding='post')

sent_2_train = tokenizer.texts_to_sequences(train["question2"].values)
sent_2_train_pad = pad_sequences(sent_2_train, maxlen=30, padding='post')


In [21]:
# Convert the held-out questions to integer sequences with the tokenizer fitted
# on the training data, then pad/truncate them to 30 tokens. The text is read
# from the `test` frame rather than from `sent_1_test` / `sent_2_test`, which
# this cell overwrites, so the cell can be re-run safely.
sent_1_test = tokenizer.texts_to_sequences(test["question1"].values)
sent_1_test_pad = pad_sequences(sent_1_test, maxlen=30, padding='post')

sent_2_test = tokenizer.texts_to_sequences(test["question2"].values)
sent_2_test_pad = pad_sequences(sent_2_test, maxlen=30, padding='post')

### Create GloVe embeddings

In [22]:
# Word -> integer index mapping learned by the tokenizer.
word_index = tokenizer.word_index

# Parse the GloVe text file into {word: float32 vector}. Each line holds a word
# followed by its vector components, separated by whitespace.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale; the `with` block closes the file on exit.
embedding_index = {}
with open('/content/glove.6B.200d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_index[word] = vectors

In [23]:
# Embedding matrix: row i holds the vector of the word whose tokenizer index is
# i (row 0 is never assigned by the loop because word_index starts at 1).
# Words without a GloVe vector keep their random initialisation; a seeded
# generator makes those rows identical from run to run.
rng = np.random.default_rng(42)
embedding_matrix = rng.random((len(word_index) + 1, 200))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

### TensorFlow models

In [42]:
# Question 1 model: embedding initialised from embedding_matrix, two stacked
# LSTMs that both return full sequences, then dense layers applied on top.
model_q1 = tf.keras.Sequential([
    Embedding(
        input_dim=len(word_index) + 1,
        output_dim=200,
        weights=[embedding_matrix],
        input_length=30,
    ),
    LSTM(128, activation='relu', return_sequences=True),
    Dropout(0.25),
    LSTM(128, return_sequences=True),
    Dropout(0.25),
    Dense(64, activation='relu'),
    Dense(2, activation='sigmoid'),
])



In [43]:
# Question 2 model: same layer stack as the question 1 branch.
model_q2 = tf.keras.Sequential()
model_q2.add(Embedding(input_dim = len(word_index)+1,
                       output_dim = 200,
                      weights = [embedding_matrix],
                      input_length = 30))
model_q2.add(LSTM(128, activation = 'relu', return_sequences = True))
model_q2.add(Dropout(0.25))
model_q2.add(LSTM(128, return_sequences = True))
# This dropout belongs to the question 2 branch. It was previously appended to
# model_q1, which left model_q2 without it and gave model_q1 an extra layer
# after its output Dense.
model_q2.add(Dropout(0.25))
model_q2.add(Dense(64, activation = 'relu'))
model_q2.add(Dense(2, activation = 'sigmoid'))



In [44]:
# Merging model output: combine the two branches element-wise, flatten the
# result, and classify it with a small dense head.
mergedOut = Multiply()([model_q1.output, model_q2.output])

mergedOut = Flatten()(mergedOut)
mergedOut = Dense(128, activation = 'relu')(mergedOut)
mergedOut = Dropout(0.25)(mergedOut)
mergedOut = Dense(64, activation = 'relu')(mergedOut)
mergedOut = Dropout(0.25)(mergedOut)
# Two mutually exclusive classes trained with sparse_categorical_crossentropy:
# the loss expects a probability distribution over the classes, so the output
# uses softmax (two independent sigmoids do not sum to 1).
mergedOut = Dense(2, activation = 'softmax')(mergedOut)

# Train the model

In [45]:
# Assemble the two-input model, compile it, and train it while evaluating on
# the held-out split after every epoch.
new_model = Model(inputs=[model_q1.input, model_q2.input], outputs=mergedOut)
new_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
history = new_model.fit(
    x=[sent_1_train_pad, sent_2_train_pad],
    y=Y_train,
    batch_size=2000,
    epochs=6,
    validation_data=([sent_1_test_pad, sent_2_test_pad], Y_test),
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
