## Questions with same meaning?
This project uses the dataset containing many questions with label whether they are the same meaning.

In [147]:
import numpy as np
import pandas as pd
import os
import csv
import codecs
from collections import Counter
import matplotlib.pyplot as plt
import operator
import re
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.stem.porter import *

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten, merge, LSTM, Lambda, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras import backend as K
import sys
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 50
VALIDATION_SPLIT = 0.20

### Read Data & primary processing

In [92]:
# All data files are in input dir.
def Get_Data_Location(File_Dir, Trainset_name, Testset_name):
    Train_File = File_Dir + Trainset_name
    Test_File = File_Dir + Testset_name
    return Train_File, Test_File

def Read_Data(Train_File, Test_File):
    data_train = pd.read_csv(Train_File)
    data_test = pd.read_csv(Test_File, nrows=20000)
    data_train.drop_duplicates(inplace=True)
    data_train.dropna(inplace=True)
    data_test.drop_duplicates(inplace=True)
    data_test.dropna(inplace=True)
    print ("Shape of train File = ", data_train.shape)
    print ("Shape of test File = ", data_test.shape)
    print (data_train.head(2))
    print (data_test.head(2))
    return data_train, data_test

In [99]:
Vector_DIR = './input/'
print('Indexing word vectors.')
embeddings_index = {}
f = codecs.open(os.path.join(Vector_DIR, 'vectors.txt'), encoding='utf-8')
for line in f:
    values = line.split(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 16143 word vectors.


In [94]:
Train_File, Test_File = Get_Data_Location("./input/", "train.csv", "test.csv")
data_train, data_test = Read_Data(Train_File, Test_File)

Shape of train File =  (404288, 6)
Shape of test File =  (20000, 3)
   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   

                                           question2  is_duplicate  
0  What is the step by step guide to invest in sh...             0  
1  What would happen if the Indian government sto...             0  
   test_id                                          question1  \
0        0  How does the Surface Pro himself 4 compare wit...   
1        1  Should I have a hair transplant at age 24? How...   

                                           question2  
0  Why did Microsoft choose core m3 and not core ...  
1        How much cost does hair transplant require?  


Remove the duplicated questions and get the statistic value of words.

In [95]:
def statistic_traindata(data_train):
    Num_not_duplicate = data_train.is_duplicate.value_counts()[0]
    Num_duplicate = data_train.is_duplicate.value_counts()[1]
    print("Num_not_duplicate =", Num_not_duplicate)
    print("Num_duplicate =", Num_duplicate)

    total_question_list = data_train.qid1.tolist() + data_train.qid2.tolist()
    total_unique_question = np.unique(total_question_list)
    print("Total number of unique question =", len(total_unique_question))

    question_ids_counter = Counter(total_question_list)
    More_than_once_question = []
    for i in question_ids_counter.values():
        if i > 1:
            More_than_once_question.append(i)
    print("Number of questions more than once =", len(More_than_once_question))
    return Num_not_duplicate, Num_duplicate, total_unique_question, More_than_once_question

In [96]:
Num_not_duplicate, Num_duplicate, total_unique_question, More_than_once_question = statistic_traindata(data_train)

Num_not_duplicate = 255025
Num_duplicate = 149263
Total number of unique question = 537931
Number of questions more than once = 111778


###  Generate dictionary
Here, we tokenize the sentences to get words from the questions.
Then, use porter stemmer to break down words into their basic form.
Also use NLTK stopwords to ignore basic words and genism to train dictionary.

In [97]:
words = re.compile(r"\w+",re.I)
stopword = stopwords.words('english')
stemmer = PorterStemmer()

def tokenize_questions(df):
    question_1_tokenized = []
    question_2_tokenized = []

    for q in df.question1.tolist():
        question_1_tokenized.append([stemmer.stem(i.lower()) for i in words.findall(q) if i not in stopword])

    for q in df.question2.tolist():
        question_2_tokenized.append([stemmer.stem(i.lower()) for i in words.findall(q) if i not in stopword])

    df["Question_1_tok"] = question_1_tokenized
    df["Question_2_tok"] = question_2_tokenized
    
    return df

def train_dictionary(df):
    
    questions_tokenized = df.Question_1_tok.tolist() + df.Question_2_tok.tolist()
    
    dictionary = corpora.Dictionary(questions_tokenized)
    dictionary.filter_extremes(no_below=5, no_above=0.8)
    dictionary.compactify()
    
    return dictionary
    
data_train = tokenize_questions(data_train)
dictionary = train_dictionary(data_train)
print ("No. of words in the dictionary = %s" %len(dictionary.token2id))

data_test = tokenize_questions(data_test)

No. of words in the dictionary = 21461


In [126]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(data_train.Question_1_tok.tolist() + data_train.Question_2_tok.tolist() + data_test.Question_1_tok.tolist() + data_test.Question_2_tok.tolist())
sequences_1 = tokenizer.texts_to_sequences(data_train.Question_1_tok.tolist())
sequences_2 = tokenizer.texts_to_sequences(data_train.Question_2_tok.tolist())
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

test_sequences_1 = tokenizer.texts_to_sequences(data_test.Question_1_tok.tolist())
test_sequences_2 = tokenizer.texts_to_sequences(data_test.Question_2_tok.tolist())

data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(data_train.is_duplicate.tolist())
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
test_labels = np.array(data_test.test_id.tolist())
# del test_sequences_1
# del test_sequences_2
# del sequences_1
# del sequences_2
import gc
gc.collect()

Found 67564 unique tokens.
Shape of data tensor: (404288, 30)
Shape of label tensor: (404288,)


0

In [50]:
test_labels = np.array(data_test.test_id.tolist())
# del test_sequences_1
# del test_sequences_2
# del sequences_1
# del sequences_2
import gc
gc.collect()

802

In [53]:
data_1[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          1,  624,  624, 2151,  206,  447,  158,   10], dtype=int32)

In [140]:
print('Preparing embedding matrix.')
# prepare embedding matrix
# nb_words = len(word_index)
nb_words = min(MAX_NB_WORDS, len(word_index))

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix.
Null word embeddings: 51423


In [141]:
embedding_layer = Embedding(nb_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [142]:
from keras.layers.merge import add, concatenate
# Model Architecture #
sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = Conv1D(128, 3, activation='relu')(embedded_sequences_1)
x1 = MaxPooling1D(10)(x1)
x1 = Flatten()(x1)
x1 = Dense(64, activation='relu')(x1)
x1 = Dropout(0.2)(x1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = Conv1D(128, 3, activation='relu')(embedded_sequences_2)
y1 = MaxPooling1D(10)(y1)
y1 = Flatten()(y1)
y1 = Dense(64, activation='relu')(y1)
y1 = Dropout(0.2)(y1)

# merged = merge([x1,y1], mode='concat')
merged = concatenate([x1,y1])
merged = BatchNormalization()(merged)
merged = Dense(64, activation='relu')(merged)
merged = Dropout(0.2)(merged)
merged = BatchNormalization()(merged)
preds = Dense(1, activation='sigmoid')(merged)
model = Model(inputs=[sequence_1_input,sequence_2_input], outputs=preds)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [150]:
# word_index['NULL_Value'] = 0
# word_index
out_df.to_csv("test_predictions.csv", index=False)

In [148]:
# pass
model.fit([data_1,data_2], labels, validation_split=VALIDATION_SPLIT, epochs=1, batch_size=1024, shuffle=True)
preds = model.predict([test_data_1, test_data_2])
print(preds.shape)

out_df = pd.DataFrame({"test_id":test_labels, "is_duplicate":preds.ravel()})
out_df.to_csv("test_predictions.csv", index=False)

Train on 323430 samples, validate on 80858 samples
Epoch 1/1
(20000, 1)


In [144]:
print(preds.shape)

out_df = pd.DataFrame({"test_id":test_labels, "is_duplicate":preds.ravel()})
out_df.to_csv("test_predictions.csv", index=False)

(20000, 1)


In [55]:
del test_sequences_1
del test_sequences_2
del sequences_1
del sequences_2

In [56]:
gc.collect()

93

In [98]:
with open("vectors_test.txt","w") as f:
    for i in data_train.Question_1_tok.tolist():
        for j in i:
            f.write(j+" ")
    for i in data_train.Question_2_tok.tolist():
        for j in i:
            f.write(j+" ")
            

In [30]:
data_test

Unnamed: 0,test_id,question1,question2,Question_1_tok,Question_2_tok
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...,"[how, surfac, pro, 4, compar, ipad, pro]","[whi, microsoft, choos, core, m3, core, i3, ho..."
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?,"[should, i, hair, transplant, age, 24, how, mu...","[how, much, cost, hair, transplant, requir]"
2,2,What but is the best way to send money from Ch...,What you send money to China?,"[what, best, way, send, money, china, us]","[what, send, money, china]"
3,3,Which food not emulsifiers?,What foods fibre?,"[which, food, emulsifi]","[what, food, fibr]"
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?,"[how, aberystwyth, start, read]","[how, i, start, read]"
5,5,How are the two wheeler insurance from Bharti ...,I admire I am considering of buying insurance ...,"[how, two, wheeler, insur, bharti, axa, insur]","[i, admir, i, consid, buy, insur]"
6,6,How can I reduce my belly fat through a diet?,How can I reduce my lower belly fat in one month?,"[how, i, reduc, belli, fat, diet]","[how, i, reduc, lower, belli, fat, one, month]"
7,7,"By scrapping the 500 and 1000 rupee notes, how...",How will the recent move to declare 500 and 10...,"[by, scrap, 500, 1000, rupe, note, rbi, plan, ...","[how, recent, move, declar, 500, 1000, denomin..."
8,8,What are the how best books of all time?,What are some of the military history books of...,"[what, best, book, time]","[what, militari, histori, book, time]"
9,9,After 12th years old boy and I had sex with a ...,Can a 14 old guy date a 12 year old girl?,"[after, 12th, year, old, boy, i, sex, 12, year...","[can, 14, old, guy, date, 12, year, old, girl]"


In [34]:
data_train

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,Question_1_tok,Question_2_tok
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"[what, step, step, guid, invest, share, market...","[what, step, step, guid, invest, share, market]"
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"[what, stori, kohinoor, koh, noor, diamond]","[what, would, happen, indian, govern, stole, k..."
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"[how, i, increas, speed, internet, connect, us...","[how, internet, speed, increas, hack, dn]"
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,"[whi, i, mental, lone, how, i, solv]","[find, remaind, math, 23, 24, math, divid, 24,..."
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"[which, one, dissolv, water, quikli, sugar, sa...","[which, fish, would, surviv, salt, water]"
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1,"[astrolog, i, capricorn, sun, cap, moon, cap, ...","[i, tripl, capricorn, sun, moon, ascend, capri..."
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0,"[should, i, buy, tiago]","[what, keep, childern, activ, far, phone, vide..."
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1,"[how, i, good, geologist]","[what, i, great, geologist]"
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0,"[when, use, シ, instead, し]","[when, use, instead]"
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0,"[motorola, compani, can, i, hack, charter, mot...","[how, i, hack, motorola, dcx3400, free, internet]"


In [29]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x16ec5cf28>

In [32]:
data_test.Question_1_tok[0]

['how', 'surfac', 'pro', '4', 'compar', 'ipad', 'pro']

In [33]:
data_test.Question_1_tok[1]

['should',
 'i',
 'hair',
 'transplant',
 'age',
 '24',
 'how',
 'much',
 'would',
 'cost']

In [36]:
type(data_test.Question_1_tok)

pandas.core.series.Series

In [151]:
t=data_test.Question_1_tok.tolist()

In [62]:
data_1

array([[    0,     0,     0, ...,   447,   158,    10],
       [    0,     0,     0, ...,  8885, 16602,  2999],
       [    0,     0,     0, ...,   453,     9,  2428],
       ...,
       [    0,     0,     0, ...,     1,    18,  1873],
       [    0,     0,     0, ...,  2984,    40,    92],
       [    0,     0,     0, ...,    11,   127,  3314]], dtype=int32)