# reference: https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

In [1]:
import pandas as pd
import numpy as np

In [2]:
from __future__ import print_function

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import get_file

Using Theano backend.


In [4]:
# Read the training pairs and the test pairs from the working directory.
data, test_set = [pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')]

In [5]:
# Preview the first five training rows.
data.head(n=5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [6]:
# Normalise both question columns of the train and test frames to plain strings.
#
# Missing questions are replaced with '' before the conversion: a bare str()
# would turn NaN into the literal text 'nan', which the tokenizer would then
# index as if it were a real word.
#
# Original author's note: for python 2, this encoding works with tokenizer,
# no need to specify unicode or 'utf-8'.
for frame in (data, test_set):
    for column in ('question1', 'question2'):
        frame[column] = frame[column].fillna('').apply(str)

In [7]:
# Duplicate / not-duplicate label for every training pair.
target = data.loc[:, 'is_duplicate']

In [8]:
# Pull the question columns out of both frames as plain Python lists.
question1 = data['question1'].tolist()
question2 = data['question2'].tolist()

test1 = test_set['question1'].tolist()
test2 = test_set['question2'].tolist()

In [9]:
# Row counts: training pairs first, test pairs second (one per line).
print(len(question1), len(test1), sep='\n')

404290
2345796


# Tokenize the corpus, then transform each sentence into a sequence of integers corresponding to the tokenizer's word index

In [10]:
# Fit one shared vocabulary over every question from both the training and
# the test set.
MAX_NB_WORDS = 200000  # value handed to the tokenizer's `num_words` option
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(question1 + question2 + test1 + test2)

In [11]:
# word -> integer index mapping built by the fitted tokenizer; it covers the
# unique words of the whole corpus (training and test sets).
word_index = tokenizer.word_index

# Encode each training question as a list of word indices.
question1_word_sequences, question2_word_sequences = [
    tokenizer.texts_to_sequences(texts) for texts in (question1, question2)
]

print("Words in index: %d" % len(word_index))

Words in index: 137077


In [12]:
# Encode each test question as a list of word indices, using the same
# fitted tokenizer as for the training questions.
test1_word_sequences, test2_word_sequences = [
    tokenizer.texts_to_sequences(texts) for texts in (test1, test2)
]


In [14]:
# Bring every encoded question to the same fixed length so that each set of
# questions becomes one 2-D integer array.
MAX_SEQUENCE_LENGTH = 25

q1_data, q2_data, test1_data, test2_data = [
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (
        question1_word_sequences,
        question2_word_sequences,
        test1_word_sequences,
        test2_word_sequences,
    )
]

# Training labels as an integer array.
labels = np.array(target, dtype=int)

# Report the shapes of the training tensors.
for description, tensor in (
    ('Shape of question1 data tensor:', q1_data),
    ('Shape of question2 data tensor:', q2_data),
    ('Shape of label tensor:', labels),
):
    print(description, tensor.shape)

Shape of question1 data tensor: (404290, 25)
Shape of question2 data tensor: (404290, 25)
Shape of label tensor: (404290,)


#  word embedding dictionary

In [16]:
# Build a {token: vector} lookup from the pre-trained GloVe text file.
# Each line is expected to hold a token followed by EMBEDDING_DIM
# space-separated floats.
EMBEDDING_DIM = 300
embeddings_index = {}
# `with` releases the file handle even if parsing fails part-way through.
with open('glove.840B.300d.txt') as f:
    for line in f:
        line = line.rstrip()
        if not line:
            # Skip blank lines instead of failing on values[0].
            continue
        # Split from the right so that only the last EMBEDDING_DIM fields are
        # treated as numbers. A bare split() would also break on any
        # whitespace *inside* the token, pushing text into the float
        # conversion below and raising ValueError.
        values = line.rsplit(' ', EMBEDDING_DIM)
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 2196016 word vectors.


In [17]:
# One 300-d row per tokenizer index. The extra row makes row numbers line up
# with the word indices (presumably index 0 is reserved by the tokenizer --
# confirm against the Keras docs).
n_rows = len(word_index) + 1
embedding_matrix = np.zeros((n_rows, 300))

for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # No pre-trained vector for this word: its row stays all-zeros.
        continue
    embedding_matrix[row] = vector

In [18]:
# (rows, embedding dimension) of the assembled matrix.
np.shape(embedding_matrix)

(137078, 300)

# Save as NumPy arrays

In [19]:
# Persist the padded sequences, the labels and the embedding matrix as .npy
# files in the working directory.
#
# The path is handed to np.save directly so that NumPy opens *and closes* each
# file itself; wrapping the path in open(..., 'wb') left every handle without
# an explicit close. All names already end in '.npy', so the output file names
# are unchanged.
for npy_path, array in (
    ('q1_train.npy', q1_data),
    ('q2_train.npy', q2_data),
    ('test1.npy', test1_data),
    ('test2.npy', test2_data),
    ('label_train.npy', labels),
    ('word_embedding_matrix.npy', embedding_matrix),
):
    np.save(npy_path, array)