## Deep cleaning of the data

In [3]:
import numpy as np
import csv, json
from zipfile import ZipFile
from os.path import expanduser, exists
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import get_file

Using TensorFlow backend.


In [1]:
# Mount Google Drive into the Colab runtime; the data files used by the
# following cells are read from and written to paths under /content/drive.
# Running this cell prompts for an authorization code.

from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Location of the training CSV on the mounted Drive
path = "/content/drive/My Drive/NLP/Data/train.csv"

# Read the CSV, then replace every missing cell with an empty string so that
# later text processing never encounters NaN.
df = pd.read_csv(path)
df = df.fillna("")

In [0]:
# Pull the three columns used downstream out of the frame as plain Python lists
question1, question2, is_duplicate = (
    list(df[column]) for column in ('question1', 'question2', 'is_duplicate')
)

In [0]:
# Fit one shared tokenizer on both question columns, then turn each question
# into a sequence of integer word ids.

tokenizer = Tokenizer(num_words=200000)

# Both columns are concatenated so a single vocabulary covers all the text
questions = question1 + question2
tokenizer.fit_on_texts(questions)

question1_word_sequences, question2_word_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (question1, question2)
)

# Word -> integer id mapping built by the tokenizer during fitting
word_index = tokenizer.word_index

In [20]:
# Number of entries in the tokenizer's word -> id mapping
len(word_index)

95596

# GloVe embeddings

In [24]:
# Download the 840B 300d GloVe archive and unpack its text file onto Drive.
# The download/extract step is skipped when the extracted file already exists,
# so re-running the notebook does not repeat the work. Delete the file on
# Drive to force a fresh extraction (e.g. after an interrupted run).

glove_dir = "/content/drive/My Drive/NLP/Data/"
glove_txt = glove_dir + 'glove.840B.300d.txt'

if not exists(glove_txt):
    glove_zip_path = get_file('glove.840B.300d.zip', 'http://nlp.stanford.edu/data/glove.840B.300d.zip')
    # The context manager closes the archive handle as soon as extraction
    # finishes instead of leaving it open for the life of the kernel.
    with ZipFile(glove_zip_path) as glove_zip:
        glove_zip.extract('glove.840B.300d.txt', path=glove_dir)

# Display the location of the extracted embeddings file as the cell result
glove_txt

Downloading data from http://nlp.stanford.edu/data/glove.840B.300d.zip


'/content/drive/My Drive/NLP/Data/glove.840B.300d.txt'

In [29]:
# Parse the GloVe text file into a dict mapping each token to its float32 vector
embeddings_index = {}
with open('/content/drive/My Drive/NLP/Data/glove.840B.300d.txt', encoding='utf-8') as glove_file:
    for row in glove_file:
        # Split on single spaces only: the first field is the token and every
        # remaining field is one component of its embedding vector.
        token, *components = row.split(' ')
        embeddings_index[token] = np.asarray(components, dtype='float32')

print('Word embeddings: %d' % len(embeddings_index))

Word embeddings: 2196016


In [0]:
# Build the embedding lookup table: row i holds the GloVe vector of the word
# whose tokenizer id is i.

# Vocabulary size, capped at 200000 words
nb_words = min(200000, len(word_index))

# One 300-dimensional row per word id, plus one extra row so that the largest
# id is a valid row index. Rows start out as zeros.
word_embedding_matrix = np.zeros((nb_words + 1, 300))
for token, token_id in word_index.items():
    if token_id <= 200000:
        glove_vector = embeddings_index.get(token)
        # Words without a GloVe entry keep their all-zero row
        if glove_vector is not None:
            word_embedding_matrix[token_id] = glove_vector

In [31]:
# Turn the tokenized questions and the duplicate flags into fixed-size arrays.
# Every question sequence is brought to a length of exactly 25 ids.

q1_data, q2_data = (
    pad_sequences(sequences, maxlen=25)
    for sequences in (question1_word_sequences, question2_word_sequences)
)
labels = np.asarray(is_duplicate, dtype=int)

# Report the resulting shapes as a sanity check
for caption, tensor in (
    ('Shape of question1:', q1_data),
    ('Shape of question2:', q2_data),
    ('Shape of label:', labels),
):
    print(caption, tensor.shape)

Shape of question1: (404290, 25)
Shape of question2: (404290, 25)
Shape of label: (404290,)


In [0]:
# Persist the prepared arrays and the vocabulary size to Drive so a later
# notebook can load them without repeating the preprocessing.
data_dir = '/content/drive/My Drive/NLP/Data/'

arrays_to_save = {
    'q1_train.npy': q1_data,
    'q2_train.npy': q2_data,
    'label_train.npy': labels,
    'word_embedding_matrix.npy': word_embedding_matrix,
}
for file_name, array in arrays_to_save.items():
    # Passing a path (instead of an already-open file object) lets np.save
    # open AND close the file itself, so no handle is left open and the data
    # is fully written when the call returns. The names already end in
    # ".npy", so np.save does not alter them.
    np.save(data_dir + file_name, array)

with open(data_dir + 'nb_words.json', 'w') as f:
    json.dump({'nb_words': nb_words}, f)

In [33]:
pwd

'/content/drive/My Drive/NLP/Data'

In [34]:
ls

glove.840B.300d.txt  nb_words.json  q2_train.npy  word_embedding_matrix.npy
label_train.npy      q1_train.npy   train.csv
