In [1]:
import pandas as pd
import nltk
import re
import utilities as u
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
import collections
from gensim.models import KeyedVectors
EMBEDDING_FILE = '../GoogleNews-vectors-negative300.bin.gz'
model = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

KeyboardInterrupt: 

In [None]:
train_df = pd.read_csv("../train.csv")
test_df = pd.read_csv("../test.csv")

In [None]:
train_df.head(5)

In [None]:
test_df.head(5)

# Identify Feature and Target columns

In [None]:
# Extract feature (X) and target (y) columns
feature_cols = list(train_df.columns[:-1])
target_col = train_df.columns[-1]
print ("Feature Columns {}".format(feature_cols))
print ("Target Columns {}".format(target_col))
X_all = train_df[feature_cols]
y_all = pd.DataFrame(data=train_df[target_col], columns=[target_col])

In [None]:
collections.Counter(y_all['is_duplicate'])

In [None]:
y_all.head(7)

In [None]:
X_all.head(3)

# Split data in training and validation sets

# check whether data is balanced or not

In [None]:

X_train, X_valid, y_train, y_valid = train_test_split(X_all, y_all, test_size=0.30, random_state=42)
print ("Total data set: {} samples".format(X_all.shape[0]))
print ("Training set: {} samples".format(X_train.shape[0]))
print ("Valid set: {} samples".format(X_valid.shape[0]))

In [None]:
collections.Counter(y_train['is_duplicate'])

In [None]:
collections.Counter(y_valid['is_duplicate'])

In [None]:
X_valid.head(5)

In [None]:
y_valid.head(5)

# Data Preprocessing

In [None]:
def create_vocabulary(words_list,df,column_name):
    for sentence in df[column_name]:
        for word in sentence:
            words_list.append(word)
    return words_list

In [None]:
def preprocess_baseline_text(text):
    text = str(text)
    text = text.lower()
    text = text.split()
    
    return text

In [None]:
preprocess_baseline_text('How do I play Pokémon GO in Korea?')

In [None]:
X_train_subset_df['question1'] = X_train['question1'][0:10]

In [None]:
X_train_subset_df

In [None]:
X_train_subset_df_1 = pd.DataFrame()

In [None]:
X_train_subset_df_1['question1'] = X_train_subset_df['question1'].apply(lambda x:preprocess_baseline_text(x))

In [None]:
X_train_subset_df_1['question1']

In [None]:
X_all_1 = pd.DataFrame()
test_df_1 = pd.DataFrame()

In [None]:
X_all_1['question1'] = X_all['question1'].apply(lambda x:preprocess_baseline_text(x))
X_all_1['question2'] = X_all['question2'].apply(lambda x:preprocess_baseline_text(x))
test_df_1['question1'] = test_df['question1'].apply(lambda x:preprocess_baseline_text(x))
test_df_1['question2'] = test_df['question2'].apply(lambda x:preprocess_baseline_text(x))

In [None]:
X_all_1.head(5)

In [None]:
test_df_1.head(5)

In [None]:
words_list = create_vocabulary([],X_all_1,'question1')
print ("Lenght of words in X_all question 1 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_all_1,'question2')
print ("Lenght of words after adding X_all question 2 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,test_df_1,'question1')
print ("Lenght of words after adding test_df question 1 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,test_df_1,'question2')
print ("Lenght of words after adding test_df question 2 {}".format(len(words_list)))



In [None]:
print (len(set(words_list)))
print (words_list[0:10])
words_freq = collections.Counter(words_list)
words_freq.most_common(10)
words_freq_10000 = words_freq.most_common(10000)


word_in_word2vec = []
word_notin_word2vec = []

for word in words_freq.most_common(10000):
    if word[0] in model.vocab:
        word_in_word2vec.append(word[0])
    else:
        word_notin_word2vec.append(word[0])
        
print (len(word_in_word2vec))
print (len(word_notin_word2vec))
     
        

In [None]:
print (words_freq_10000[0:100])

In [None]:
print (word_notin_word2vec[0:100])  

In [None]:
print (word_in_word2vec[0:100])

# Further Preprocessing Text

In [None]:
def preprocess_text(text):
    text = str(text)
    text = text.lower()
    
    text = re.sub(r"[^A-Za-z0-9^?,!.\/'+-=]", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\?", " ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    
    text = text.split()
    
    return text

In [None]:
X_train.reset_index(drop = True, inplace = True)
X_valid.reset_index(drop = True, inplace = True)
y_train.reset_index(drop = True, inplace = True)
y_valid.reset_index(drop = True, inplace = True)

X_train_df = pd.DataFrame()
X_valid_df = pd.DataFrame()
X_test_df = pd.DataFrame()

In [None]:
X_train.head(3)

In [None]:
test_df.head(3)

In [None]:
preprocess_text('What is= the step by step guide to invest in?')

In [None]:
X_train_df['question1'] = X_train['question1'].apply(lambda x:preprocess_text(x))
X_train_df['question2'] = X_train['question2'].apply(lambda x:preprocess_text(x))
X_valid_df['question1'] = X_valid['question1'].apply(lambda x:preprocess_text(x))
X_valid_df['question2'] = X_valid['question2'].apply(lambda x:preprocess_text(x))
X_test_df['question1'] = test_df['question1'].apply(lambda x:preprocess_text(x))
X_test_df['question2'] = test_df['question2'].apply(lambda x:preprocess_text(x))

In [None]:
X_train_df.head(3)

In [237]:
X_train_df['question1'].shape

(283003,)

In [76]:
words_list = create_vocabulary([],X_train_df,'question1')
print ("Lenght of words in X_train_df question 1 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_train_df,'question2')
print ("Lenght of words after adding X_train_df question 2 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_valid_df,'question1')
print ("Lenght of words after adding X_valid_df question 1 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_valid_df,'question2')
print ("Lenght of words after adding X_valid_df question 2 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_test_df,'question1')
print ("Lenght of words after adding X_test_df question 1 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_test_df,'question2')
print ("Lenght of words after adding X_test_df question 2 {}".format(len(words_list)))

Lenght of words in X_train_df question 1 3143028
Lenght of words after adding X_train_df question 2 6358368
Lenght of words after adding X_valid_df question 1 7705257
Lenght of words after adding X_valid_df question 2 9080346
Lenght of words after adding X_test_df question 1 48938305
Lenght of words after adding X_test_df question 2 88928769


In [77]:
print (len(set(words_list)))
print (words_list[0:10])
words_freq = collections.Counter(words_list)
words_freq.most_common(10)
words_freq_10000 = words_freq.most_common(10000)


word_in_word2vec = []
word_notin_word2vec = []

for word in words_freq.most_common(10000):
    if word[0] in model.vocab:
        word_in_word2vec.append(word[0])
    else:
        word_notin_word2vec.append(word[0])
        
print (len(word_in_word2vec))
print (len(word_notin_word2vec))

131743
['how', 'is', 'the', 'working', 'environment', 'at', 'sbi', 'life', 'mumbai', 'how']
9206
794


In [79]:
print (word_notin_word2vec[0:100]) 

['a', 'to', 'of', 'and', '-', 'quora', '2016', '10', 'instagram', '500', '1000', 'whatsapp', '2017', '2015', 'snapchat', '20', ':', '12', '100', '000', '15', '30', '50', 'jio', '12th', 'sbi', '16', '11', 'brexit', '!', '18', 'upsc', 'ece', '13', 'tcs', 'narendra', 'better:', '2014', '25', '17', '14', '70', 'mbbs', 'manipal', '2000', 'gmat', '40', 'iim', '24', 'btech', 'cgpa', '200', 'iiit', 'cgl', '10th', 'obc', 'redmi', 'favourite', '90', '60', 'iits', '21', 'pilani', 'aiims', 'centre', 'mightn', '80', 'flipkart', 'mustn', 'xiaomi', '19', 'travelling', 'ielts', '22', '300', 'india:', 'bba', 'colour', 'ibps', '23', 'ps4', '2013', 'mtech', 'accenture', 'x^2', 'paytm', '25000', 'elon', 'hadoop', 'kohli', 'srm', 'kejriwal', 'bitsat', 'spotify', '11th', 'grey', "'", '32', 'ncr', 'virat']


# Keras Implementation

In [80]:
words_to_index = {}
index_to_words = {}

i = 0
for word in set(words_list):
    words_to_index[word] = i
    i = i + 1
    
j=0
for word in set(words_list):
    index_to_words[j] = word
    j=j+1

In [347]:
with open('words_to_index.pickle', 'wb') as handle:
    pickle.dump(words_to_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('index_to_words.pickle', 'wb') as handle:
    pickle.dump(index_to_words, handle, protocol=pickle.HIGHEST_PROTOCOL)    
    
    



In [95]:
print (words_to_index['the'])
print (len(words_to_index))
print (index_to_words[104295])
print (index_to_words[0])

104295
131743
the
offical


In [86]:
def word_2_integer(wordslist):
    question2integer = []
    for word in wordslist:
        question2integer.append(words_to_index[word])

    return question2integer

word_2_integer(['what', 'is', 'the', 'story', 'of', 'kohinoor'])

[19780, 69984, 104295, 12947, 38557, 71100]

In [88]:
X_train_df['question1'] = X_train_df['question1'].apply(lambda x:word_2_integer(x))


In [89]:
X_train_df['question2'] = X_train_df['question2'].apply(lambda x:word_2_integer(x))
X_valid_df['question1'] = X_valid_df['question1'].apply(lambda x:word_2_integer(x))
X_valid_df['question2'] = X_valid_df['question2'].apply(lambda x:word_2_integer(x))
X_test_df['question1'] = X_test_df['question1'].apply(lambda x:word_2_integer(x))
X_test_df['question2'] = X_test_df['question2'].apply(lambda x:word_2_integer(x))

In [90]:
X_train_df['question1'].head(5)

0    [31586, 69984, 104295, 57112, 55384, 21628, 63...
1    [31586, 126908, 23532, 84451, 69988, 115658, 9...
2    [19780, 50711, 104295, 116401, 38557, 117493, ...
3    [31586, 19743, 104295, 49658, 18288, 38557, 22...
4    [69984, 104295, 1542, 29811, 72084, 47147, 416...
Name: question1, dtype: object

In [99]:
import numpy as np
embed_length = 300
embed_matrix = np.random.randn(len(words_to_index)+1,embed_length)
# To ignore padding
embed_matrix[0] = 0



In [100]:
print (embed_matrix.shape)

(131744, 300)


In [102]:
print (embed_matrix[1])

[-1.07708578e-01 -9.63583168e-01  1.24014304e+00  2.09291630e+00
  7.82844117e-01  7.95844865e-01  1.83087362e+00  5.86118661e-01
  3.23307614e-01 -3.93995365e-01  2.31290785e+00  5.35420298e-01
 -1.06687072e+00 -4.79405781e-01  1.94332778e+00  9.79287937e-01
  4.90191438e-02 -1.78281724e-01 -2.49248430e+00  1.08787487e+00
 -2.00464048e+00 -1.48177593e+00 -4.66502725e-01 -4.89422453e-01
 -9.32083771e-01 -1.17234855e+00 -2.21491555e+00 -8.81211667e-01
  4.75670694e-01  1.29135833e+00 -8.67405267e-02 -5.58238089e-01
 -6.41726175e-02  2.22534919e-01  1.22458135e+00  3.57438286e-01
 -1.78101599e+00  7.31911884e-01  3.42767622e-02 -5.21363852e-01
 -1.20148561e+00 -8.33140440e-01  4.30584783e-01  1.88149497e+00
 -8.27923703e-01 -1.46507591e-02  6.97847349e-01 -2.26018209e-01
  7.88902921e-02 -5.32683041e-02  2.64329531e-01  1.38200321e+00
 -8.02914468e-01 -7.58184883e-01  2.37706989e-01 -1.96855939e+00
  1.55745681e+00 -8.43194801e-01 -2.00822619e-01  4.85257283e-01
 -1.68236304e-01 -1.15431

In [105]:
#Updating embedding matrix 
count = 0
for word, index in words_to_index.items():
    if word in model.vocab:
        count = count + 1
        embed_matrix[index] = model.word_vec(word)

print (count)


58547


In [106]:
print (len(words_to_index))

131743


In [130]:
max_seq_length = max([
max(list(X_train_df['question1'].map(lambda x: len(x)))),
max(list(X_train_df['question2'].map(lambda x: len(x)))),
max(list(X_valid_df['question1'].map(lambda x: len(x)))),
max(list(X_valid_df['question2'].map(lambda x: len(x)))),
max(list(X_test_df['question1'].map(lambda x: len(x)))),
max(list(X_test_df['question2'].map(lambda x: len(x)))),
])

In [131]:
print (max_seq_length)

242


In [135]:
# Convert labels to their numpy representations
Y_train =  y_train.values
Y_valid =  y_valid.values


In [137]:
assert X_train_dict['left'].shape == X_train_dict['right'].shape

In [138]:
assert len(X_train_dict['left']) == len(Y_train)


In [143]:
#padding to max seq length

In [145]:
X_train_df['question1'][0]

[31586, 69984, 104295, 57112, 55384, 21628, 63296, 131479, 95639]

In [158]:
def pad_left_zeros(question_list,max_seq_length):
    question_list = [0] * (max_seq_length - len(question_list)) + question_list
    return question_list

In [156]:
len (pad_left_zeros([31586, 69984, 104295, 57112, 55384, 21628, 63296, 131479, 95639]))

242

In [161]:
X_train_df['question1'] = X_train_df['question1'].apply(lambda x: pad_left_zeros(x,max_seq_length))

In [163]:
X_train_df['question2'] = X_train_df['question2'].apply(lambda x: pad_left_zeros(x,max_seq_length))
X_valid_df['question1'] = X_valid_df['question1'].apply(lambda x: pad_left_zeros(x,max_seq_length))
X_valid_df['question2'] = X_valid_df['question2'].apply(lambda x: pad_left_zeros(x,max_seq_length))
X_test_df['question1'] = X_test_df['question1'].apply(lambda x: pad_left_zeros(x,max_seq_length))
X_test_df['question2'] = X_test_df['question2'].apply(lambda x: pad_left_zeros(x,max_seq_length))

In [190]:
# Split to dicts
X_train_dict = {'left': X_train_df['question1'], 'right': X_train_df['question2']}
X_valid_dict = {'left': X_valid_df['question1'], 'right': X_valid_df['question2']}
X_test_dict = {'left': X_test_df['question1'], 'right': X_test_df['question2']}

In [170]:
from keras.layers import Input, Embedding, LSTM

In [255]:
import keras
from keras.layers import Input, LSTM, Dense
from keras.models import Model

In [None]:
data_dim = 300
timesteps = 242
nb_classes = 2

In [322]:
# expected input data shape: (batch_size, timesteps, data_dim)
#Input None,max_seq_length,1

left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')

In [323]:
embedding_layer = Embedding(len(embed_matrix),
                            embed_length,
                            weights=[embed_matrix],
                            input_length=max_seq_length,
                            trainable=False)

In [324]:
#Embedded version of the inputs
encoded_left = embedding_layer(left_input)
encoded_right = embedding_layer(right_input)

In [325]:
encoded_left.shape

TensorShape([Dimension(None), Dimension(242), Dimension(300)])

In [326]:
# This layer can take as input a matrix
# and will return a vector of size 64
shared_lstm = LSTM(64)

In [327]:
left_output = shared_lstm(encoded_left)
right_output = shared_lstm(encoded_right)


In [328]:
left_output.shape

TensorShape([Dimension(None), Dimension(64)])

In [329]:
# We can then concatenate the two vectors:
merged_vector = keras.layers.concatenate([left_output,right_output], axis=-1)

In [330]:
# And add a logistic regression on top
predictions = Dense(1, activation='sigmoid')(merged_vector)

In [331]:
X_train_dict['left'] = np.concatenate(X_train_dict['left']).reshape(X_train_dict['left'].shape[0],max_seq_length)

In [332]:
X_train_dict['left'].shape

(283003, 242)

In [333]:
X_train_dict['right'] = np.concatenate(X_train_dict['right']).reshape(X_train_dict['right'].shape[0],max_seq_length)

In [334]:
X_train_dict['right'].shape

(283003, 242)

In [337]:
X_valid_dict['left'] = np.concatenate(X_valid_dict['left']).reshape(X_valid_dict['left'].shape[0],max_seq_length)

In [338]:
X_valid_dict['right'] = np.concatenate(X_valid_dict['right']).reshape(X_valid_dict['right'].shape[0],max_seq_length)

In [335]:
X_train_dict['right'][0]

array([     0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
      

In [343]:
from time import time
import datetime

In [340]:

# We define a trainable model linking the
# tweet inputs to the predictions
model = Model(inputs=[left_input, right_input], outputs=predictions)

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

n_epoch = 1
# Start training
training_start_time = time()

model.fit([X_train_dict['left'], X_train_dict['right']], Y_train, batch_size=128, epochs=n_epoch,
         validation_data=([X_valid_dict['left'], X_valid_dict['right']], Y_valid))

print("Training time finished.\n{} epochs in {}".format(n_epoch, datetime.timedelta(seconds=time()-training_start_time)))

Train on 283003 samples, validate on 121287 samples
Epoch 1/1


NameError: name 'n_epoch' is not defined

In [342]:
n_epoch = 4
training_start_time = time()

model.fit([X_train_dict['left'], X_train_dict['right']], Y_train, batch_size=128, epochs=n_epoch,
         validation_data=([X_valid_dict['left'], X_valid_dict['right']], Y_valid))

print("Training time finished.\n{} epochs in {}".format(n_epoch, datetime.timedelta(seconds=time()-training_start_time)))

Train on 283003 samples, validate on 121287 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


NameError: name 'datetime' is not defined

In [344]:
n_epoch = 20
training_start_time = time()

model.fit([X_train_dict['left'], X_train_dict['right']], Y_train, batch_size=128, epochs=n_epoch,
         validation_data=([X_valid_dict['left'], X_valid_dict['right']], Y_valid))

print("Training time finished.\n{} epochs in {}".format(n_epoch, datetime.timedelta(seconds=time()-training_start_time)))

Train on 283003 samples, validate on 121287 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training time finished.
20 epochs in 7:23:34.362409


In [345]:
import pickle
 
# # save the tokenizer and model
# with open("keras_tokenizer.pickle", "wb") as f:
#    pickle.dump(tokenizer, f)
model.save("quora_keras_model_v1.hdf5")

In [348]:
with open('words_to_index.pickle', 'rb') as handle:
    words_to_index_1 = pickle.load(handle)

In [363]:
with open('index_to_words.pickle', 'rb') as handle:
    index_to_words_1 = pickle.load(handle)

In [349]:
print (len(words_to_index_1))

131743


In [364]:
print (len(index_to_words))

131743


In [365]:
print (index_to_words[0])

offical


In [491]:
newtexts = ["How do I read and find my YouTube comments?", "How can I see all my Youtube comments?"]

In [492]:
print (newtexts)

['How do I read and find my YouTube comments?', 'How can I see all my Youtube comments?']


In [493]:

final_new_words_list = []
print (new_words_list)
i = 0
for words_list in newtexts:
    new_words_list = []
    print (words_list)
    for word in preprocess_text(words_list):
        print (word)
        if words_to_index_1.get(word):
            
            new_words_list.append(words_to_index_1.get(word))
        else:
            print (word)
    final_new_words_list.append(np.array(pad_left_zeros(new_words_list,max_seq_length)))
    

[19780, 58041, 96722, 19743, 95526, 83139, 23532, 29267, 92085]
How do I read and find my YouTube comments?
how
do
i
read
and
find
my
youtube
comments
How can I see all my Youtube comments?
how
can
i
see
all
my
youtube
comments


In [494]:
print (final_new_words_list)

[array([     0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
     

In [495]:
from keras.models import load_model

predict_model = load_model("quora_keras_model_v1.hdf5")

In [496]:
y_prob = predict_model.predict([final_new_words_list[0].reshape(1,max_seq_length), final_new_words_list[1].reshape(1,max_seq_length)],batch_size=1, verbose=1, steps=None)



In [497]:
from tensorflow.python.keras import utils

In [498]:
y_classes = y_prob.argmax(axis=-1)

In [499]:
print (y_classes)

[0]


In [500]:
print (y_prob)

[[0.01316933]]


In [None]:
s1rnn = Sequential()
s1rnn.add(embedding_layer_1)
s1rnn.add(LSTM(128, input_shape=(100, 1)))
s1rnn.add(Dense(1))

s2rnn = Sequential()
s2rnn.add(embedding_layer_2)
s2rnn.add(LSTM(128, input_shape=(100, 1)))
s2rnn.add(Dense(1))

In [None]:
preprocess_baseline_text(X_train['question1'][0])

In [None]:
X_test_df.head(5)

In [None]:
X_train_df.head(5)

In [None]:
# re.sub(r"[^A-Za-z0-9,!.\/'+-=]", " ", 'why am i mentally very lonely? how can i solve')

In [None]:
# re.sub(r"\'s", " ", 'what\'s')

In [None]:
# words_list = []
# # train_subset_df = train_df['question1'][0:10]
# train_subset_df['question1'] = pd.DataFrame(data=train_df['question1'][0:10], columns=['question1'])
# train_subset_df['question2'] = pd.DataFrame(data=train_df['question2'][0:10], columns=['question2'])

In [None]:
# for sentence in train_subset_df['question1']:
#     for word in sentence:
#         words_list.append(word)
# print (len(set(words_list)))

## Feature based on how many words are common in question 1 and question 2

In [None]:
# u.termfrequency(['What is the step by step guide to invest in share market in india?'], ['What is the step by step guide to invest in share market?'])

In [None]:
def termfrequency(sentence1, sentence2):
   
    question_dict ={}
    sentence1_words = sentence1   
    sentence2_words = sentence2
    searchtermfreq = []
    i = 0
    
    for key in sentence1_words:
#         print (key)
        question_dict[key] = question_dict.get(key,0) + 1
    
    for key in set(sentence2_words):
        value =  question_dict.get(key,0)
        if value >= 1:
            value = 1
        searchtermfreq.append(value)
        
    
#     print (question_dict)
#     print (searchtermfreq)
#     print (sum(searchtermfreq))
    return sum(searchtermfreq)

termfrequency(['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in', 'india?'], ['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market?'])
    

In [None]:
X_train_df['common_term_freq'] = X_train_df.apply(lambda x: termfrequency(x['question1'],x['question2']), axis=1 )

In [None]:
X_train_df.head(10)

In [None]:
X_valid_df['common_term_freq'] = X_valid_df.apply(lambda x: termfrequency(x['question1'],x['question2']), axis=1 )

In [None]:
X_valid_df.head(10)

# Total words frequency

In [None]:
def total_words_freq(sentence):
    return len(sentence)

In [None]:
total_words_freq(['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in', 'india?'])

In [None]:
X_train_df['question1_words_freq'] = X_train_df['question1'].map(lambda x: total_words_freq(x))

In [None]:
X_train_df['question2_words_freq'] = X_train_df['question2'].map(lambda x: total_words_freq(x))

In [None]:
X_train_df.head(5)

In [None]:
X_valid_df['question1_words_freq'] = X_valid_df['question1'].map(lambda x: total_words_freq(x))

In [None]:
X_valid_df['question2_words_freq'] = X_valid_df['question2'].map(lambda x: total_words_freq(x))

In [None]:
X_valid_df.head(5)

In [None]:
X_train_model_input = X_train_df.drop(['question1','question2'],axis =1)

In [None]:
X_valid_model_input = X_valid_df.drop(['question1','question2'],axis =1)

In [None]:
X_train_model_input.head(3)

# Baseline Model

In [None]:

# Train a model# Train  
import time

def train_classifier(clf, X_train, y_train):
    print ("Training {}...".format(clf.__class__.__name__))
    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()
    print ("Done!\nTraining time (secs): {:.3f}".format(end - start))

In [None]:
from sklearn.linear_model import LogisticRegression

clf =  LogisticRegression()

In [None]:
train_classifier(clf, X_train_model_input, y_train.values.ravel())

In [None]:
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix

def predict_labels(clf, X_train, y_train):
    print ("Predicting labels using {}...".format(clf.__class__.__name__))
    start = time.time()
    y_pred = clf.predict(X_train)
    end = time.time()
    print ("Done!\nPrediction time (secs): {:.3f}".format(end - start))
    return log_loss(y_train, y_pred, eps=1e-15), confusion_matrix(y_train, y_pred)

train_metrics = predict_labels(clf, X_train_model_input, y_train.values.ravel())

print 
print ("Log loss for training set: {}".format(train_metrics[0]))

print ("Confusion matrix for training set: {}".format(train_metrics[1]))

In [None]:
# Predict on test data
print ("Log loss for validation set: {}".format(predict_labels(clf, X_valid_model_input, y_valid.values.ravel())[0]))

In [None]:

print ("Confusion matrix for validation set: {}".format(predict_labels(clf, X_valid_model_input, y_valid.values.ravel())[1]))

# Text Preprocessing

In [None]:
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)

In [None]:
words_list = create_vocabulary([],X_train_df,'question1')
print ("Lenght of words in X_train_df question 1 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_train_df,'question2')
print ("Lenght of words after adding X_train_df question 2 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_valid_df,'question1')
print ("Lenght of words after adding X_valid_df question 1 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_valid_df,'question2')
print ("Lenght of words after adding X_valid_df question 2 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_test_df,'question1')
print ("Lenght of words after adding X_test_df question 1 {}".format(len(words_list)))
words_list = create_vocabulary(words_list,X_test_df,'question2')
print ("Lenght of words after adding X_test_df question 2 {}".format(len(words_list)))



In [None]:

def preprocess_text(list_words):
     list_words_processed = []
     for text in list_words:
         text = re.sub(r"\?", '', text)
         text = re.sub(r"i'm", "i am ", text)
#          print (text)
         list_words_processed.append(str(text))
#          print (list_words_processed)
     return list_words_processed

In [None]:
preprocess_text(['India?'])

In [None]:
preprocess_text(['how',
 'do',
 'the',
 'holy',
 'scriptures',
 'of',
 'hinduism',
 'compare',
 'and',
 'contrast',
 'to',
 'those',
 'of',
 'taoism?'])

In [None]:
X_train_df['question1'][3]

In [None]:
preprocess_text(X_train_df['question1'][3])

In [None]:
X_train_p_df = pd.DataFrame()
X_valid_p_df = pd.DataFrame()
X_test_p_df = pd.DataFrame()

In [None]:
X_train_p_df['question1'] = X_train_df['question1'].apply(lambda x:preprocess_text(x))
X_train_p_df['question2'] = X_train_df['question2'].apply(lambda x:preprocess_text(x))
X_valid_p_df['question1'] = X_valid_df['question1'].apply(lambda x:preprocess_text(x))
X_valid_p_df['question2'] = X_valid_df['question2'].apply(lambda x:preprocess_text(x))
X_test_p_df['question1'] = X_test_df['question1'].apply(lambda x:preprocess_text(x))
X_test_p_df['question2'] = X_test_df['question2'].apply(lambda x:preprocess_text(x))

In [None]:
X_train_p_df['question1'][2]

In [None]:
proc_words_list = create_vocabulary([],X_train_p_df,'question1')
print ("Lenght of words in X_train_df question 1 {}".format(len(words_list)))
proc_words_list = create_vocabulary(words_list,X_train_p_df,'question2')
print ("Lenght of words after adding X_train_df question 2 {}".format(len(words_list)))
proc_words_list = create_vocabulary(words_list,X_valid_p_df,'question1')
print ("Lenght of words after adding X_valid_df question 1 {}".format(len(words_list)))
proc_words_list = create_vocabulary(words_list,X_valid_p_df,'question2')
print ("Lenght of words after adding X_valid_df question 2 {}".format(len(words_list)))
proc_words_list = create_vocabulary(words_list,X_test_p_df,'question1')
print ("Lenght of words after adding X_test_df question 1 {}".format(len(words_list)))
proc_words_list = create_vocabulary(words_list,X_test_p_df,'question2')
print ("Lenght of words after adding X_test_df question 2 {}".format(len(words_list)))



In [None]:
proc_words_freq = collections.Counter(proc_words_list)

In [None]:
proc_words_freq_10000 = proc_words_freq.most_common(10000)

In [None]:
proc_word_in_word2vec = []
proc_word_notin_word2vec = []

for word in proc_words_freq.most_common(10000):
    if word[0] in model.vocab:
        proc_word_in_word2vec.append(word[0])
    else:
        proc_word_notin_word2vec.append(word[0])

In [None]:
print (len(proc_word_in_word2vec))
print (len(proc_word_notin_word2vec))
print (proc_word_notin_word2vec[0:100])

In [None]:
print (proc_word_in_word2vec[0:100])

# Pipeline

# Keras

## Create Vocabulary

In [None]:
#