In [31]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from tensorflow.python.keras.layers import *
from tensorflow.python.keras.models import Model
import numpy as np 
import pandas as pd 
import re
import nltk
from preprocess import *
from models import *

In [32]:
df = pd.read_csv("questions.csv")
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [33]:
# question_1, question_2 = df['question1'].to_list(), df['question2'].to_list()
# is_duplicate = df['is_duplicate'].to_list()
# preprocess_neural(question_1, question_2, is_duplicate)

In [34]:
df1 = pd.read_csv("preprocessed_neural.csv")
q1_preprocessed, q2_preprocessed, is_duplicate = df1['question1'].to_list(), df1['question2'].to_list(), df1['is_duplicate'].to_list()

In [35]:
df1.head()

Unnamed: 0,question1,question2,is_duplicate
0,step step guide invest share market india,step step guide invest share market,0
1,story kohinoor kohinoor diamond,would happen indian government stole kohinoor ...,0
2,increase speed internet connection using vpn,internet speed increased hacking dns,0
3,mentally lonely solve,find remainder math2324math divided 2423,0
4,one dissolve water quikly sugar salt methane c...,fish would survive salt water,0


Acquired Test data

In [36]:
MAX_NB_WORDS = 200000
tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts(list(df1['question1'].values.astype(str))+list(df1['question2'].values.astype(str)))

In [37]:

q1_sequence = tokenizer.texts_to_sequences(df['question1'].values.astype(str))
q1_sequence = pad_sequences(q1_sequence, maxlen = 30, padding='post')

q2_sequence = tokenizer.texts_to_sequences(df['question2'].values.astype(str))
q2_sequence = pad_sequences(q2_sequence, maxlen = 30, padding='post')

In [38]:
windex = tokenizer.word_index

In [39]:
embedding_index = {}
with open('glove.6B.300d.txt','r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_index[word] = vectors
    f.close()

In [40]:
embedding_matrix = np.random.random((len(windex)+1, 300))

for word, i in windex.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print(embedding_matrix.shape)

(108101, 300)


In [41]:
q1_embeddings = []
for i in range(len(q1_sequence)):
    embedding = np.zeros(300)
    for j in range(len(q1_sequence[i])):
        embedding += embedding_matrix[q1_sequence[i][j]]
    embedding /= len(q1_sequence[i])
    q1_embeddings.append(embedding)

q2_embeddings = []
for i in range(len(q2_sequence)):
    embedding = np.zeros(300)
    for j in range(len(q2_sequence[i])):
        embedding += embedding_matrix[q2_sequence[i][j]]
    embedding /= len(q2_sequence[i])
    q2_embeddings.append(embedding)

In [42]:
# change all 0 to 0 1 and 1 to 1 0
for i in range(len(is_duplicate)):
    if is_duplicate[i] == 0:
        is_duplicate[i] = [0,1]
    else:
        is_duplicate[i] = [1,0]

In [43]:
#split the data into 70-20-10 train-validation-test with random state 42
from sklearn.model_selection import train_test_split
q1_train, q1_test, q2_train, q2_test, y_train, y_test = train_test_split(q1_embeddings, q2_embeddings, is_duplicate, test_size=0.1, random_state=42)
q1_train, q1_val, q2_train, q2_val, y_train, y_val = train_test_split(q1_train, q2_train, y_train, test_size=0.222, random_state=42)

In [45]:
#print the ratio of positive and negative samples in train, validation and test
y_train, y_val, y_test = np.array(y_train), np.array(y_val), np.array(y_test)
print("Train: ", sum(y_train)/len(y_train))
print("Validation: ", sum(y_val)/len(y_val))
print("Test: ", sum(y_test)/len(y_test))

Train:  [0.36970243 0.63029757]
Validation:  [0.36774353 0.63225647]
Test:  [0.36907706 0.63092294]


In [46]:
def concatenate_embeddings(q1_embeddings, q2_embeddings):
    embeddings = np.zeros((len(q1_embeddings), 900))
    for i in range(len(q1_embeddings)):
        for j in range(0, 300):
            embeddings[i][j] = q1_embeddings[i][j] + q2_embeddings[i][j]
        for j in range(300, 600):
            embeddings[i][j] = q1_embeddings[i][j-300] - q2_embeddings[i][j-300]
        for j in range(600, 900):
            embeddings[i][j] = q1_embeddings[i][j-600] * q2_embeddings[i][j-600]
    return embeddings
xtrain_concat = concatenate_embeddings(q1_train, q2_train)
xval_concat = concatenate_embeddings(q1_val, q2_val)
xtest_concat = concatenate_embeddings(q1_test, q2_test)

In [47]:
print(xval_concat.shape)
print(xtrain_concat.shape)

(80790, 900)
(283125, 900)


In [58]:
model = CBOW(embedding_matrix, embedding_matrix.shape[0], loss="binary_crossentropy", epochs=50)

In [59]:
xtrain_concat, xval_concat, y_train, y_val = np.array(xtrain_concat), np.array(xval_concat), np.array(y_train), np.array(y_val)
model.fit(xtrain_concat, xval_concat, y_train, y_val)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [60]:
y_pred = model.predict(xtest_concat)
y_pred1d = []
for i in range(len(y_pred)):
    if(y_pred[i][0] > y_pred[i][1]):
        y_pred1d.append(0)
    else:
        y_pred1d.append(1)

y_test1d = []
for i in range(len(y_test)):
    if(y_test[i][0] > y_test[i][1]):
        y_test1d.append(0)
    else:
        y_test1d.append(1)

from sklearn.metrics import accuracy_score, f1_score
print("Accuracy: ", accuracy_score(y_test1d, y_pred1d))
print("F1 Score: ", f1_score(y_test1d, y_pred1d))

Accuracy:  0.7762884558314374
F1 Score:  0.8167044901929
