In [1]:
import pandas as pd
import numpy as np
import pickle

# preprocessing
from sklearn import preprocessing

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

import gensim
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
from gensim.utils import simple_preprocess

Using TensorFlow backend.


In [36]:
# Read the raw training and test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

## Encode labels

In [37]:
# Turn the author column into integer class ids.
# `le` keeps the fitted mapping; `labels` has one entry per row of `train_data`
# as it is at this point (i.e. before the validation rows are split off below).
le = preprocessing.LabelEncoder()
author_column = train_data.author
labels = le.fit_transform(author_column)

## Split into train and validation datasets

In [38]:
# Hold out a fixed-seed random 20% of the rows for validation and keep the
# remaining rows as the training set.
# NOTE: `train_data` is overwritten here, so re-running this cell on its own
# splits the already reduced frame again -- re-run from the loading cell instead.
val_data = train_data.sample(frac=0.2, random_state=42)
is_validation_row = train_data.index.isin(val_data.index)
train_data = train_data[~is_validation_row]

## Convert words to index sequences

In [39]:
NUM_WORDS = 20000
texts = train_data.text

# The vocabulary is built from the training texts only; validation and test
# texts are converted with that same fitted tokenizer.
tokenizer = Tokenizer(
    num_words=NUM_WORDS,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
    lower=True,
)
tokenizer.fit_on_texts(texts)

# Turn every sentence of each split into a list of word indexes.
sequences_train, sequences_valid, sequences_test = (
    tokenizer.texts_to_sequences(corpus)
    for corpus in (texts, val_data.text, test_data.text)
)

word_index = tokenizer.word_index

# Show the first training and validation sequence, then the vocabulary size.
for first_sequence in (sequences_train[0], sequences_valid[0]):
    print(first_sequence)
    print('\n')
print('Found %s unique tokens.' % len(word_index))

[11, 89, 122, 817, 4, 22, 9, 1, 5591, 81, 28, 6, 443, 2881]


[1, 1141, 4657, 3, 1, 408, 16832, 181, 2, 1, 2181, 9344, 3, 3783, 32, 62, 121, 43, 1, 1923]


Found 23067 unique tokens.


In [42]:
# Pad every split to one common length: the length of the longest training
# sequence. With pad_sequences' defaults, shorter sequences are zero-padded at
# the front and longer validation/test sequences are truncated at the front.
X_train = pad_sequences(sequences_train)
X_val = pad_sequences(sequences_valid, maxlen=X_train.shape[1])
X_test = pad_sequences(sequences_test, maxlen=X_train.shape[1])

# `labels` was computed on the frame before the split, so the index labels
# kept by each split are used to pick the matching entries.
# NOTE(review): this positional lookup assumes the CSV was read with the
# default 0..n-1 index -- confirm if the loading cell ever changes.
#
# num_classes is passed explicitly so both one-hot matrices always have the
# same width, even if a split happens not to contain the highest class id
# (to_categorical otherwise infers the width from the largest label it sees).
num_classes = len(le.classes_)
y_train = to_categorical(labels[train_data.index], num_classes=num_classes)
y_val = to_categorical(labels[val_data.index], num_classes=num_classes)

print('Shape of X train: {0} and X validation tensor: {1}'.format(X_train.shape, X_val.shape) )
print('Shape of label train: {0}  and validation tensor: {1}'.format(y_train.shape, y_val.shape) )
print('Shape of X_test : {0}'.format(X_test.shape) )

Shape of X train: (15663, 861) and X validation tensor: (3916, 861)
Shape of label train: (15663, 3)  and validation tensor: (3916, 3)
Shape of X_test : (8392, 861)


In [9]:
EMBEDDING_DIM = 300

# Load the pretrained word2vec vectors from the binary GoogleNews file.
word_vectors = KeyedVectors.load_word2vec_format(
    'data/GoogleNews-vectors-negative300.bin',
    binary=True,
)

# Number of rows in the embedding matrix: the tokenizer vocabulary plus one,
# capped at NUM_WORDS (whichever is smaller).
vocabulary_size = min(NUM_WORDS, len(word_index) + 1)

# Start from an all-zero matrix; the rows are filled in by the next cell.
embedding_matrix = np.zeros(shape=(vocabulary_size, EMBEDDING_DIM))

In [10]:
# Fill the embedding matrix: row i receives the vector of the word whose
# tokenizer index is i. Words with an index >= NUM_WORDS are skipped, which
# keeps every written row inside the matrix allocated above.
#
# Words missing from the pretrained vectors get a random normal vector
# (mean 0, variance 0.25). A dedicated, seeded generator and a fixed
# iteration order (ascending word index) make those vectors -- and therefore
# the saved embedding matrix -- identical on every run.
oov_rng = np.random.RandomState(42)
oov_std = np.sqrt(0.25)

for word, i in sorted(word_index.items(), key=lambda item: item[1]):
    if i >= NUM_WORDS:
        continue
    try:
        # get vector for each word
        embedding_vector = word_vectors[word]
    except KeyError:
        # generate random vector if the word was not found in pretrained vectors
        embedding_vector = oov_rng.normal(0, oov_std, EMBEDDING_DIM)
    # save vector into embedding matrix
    embedding_matrix[i] = embedding_vector

# to free up some memory
# NOTE: after this, the cell cannot be re-run without reloading word_vectors.
del word_vectors

In [27]:
# Persist the embedding matrix; np.save appends the '.npy' extension itself.
np.save(file='data/processed/embedding_matrix', arr=embedding_matrix)
# To load it back later:
# emb = np.load('data/processed/embedding_matrix' + '.npy')

In [10]:
# Save the dictionary mapping words to indexes (should not be needed, because
# the same mapping lives on the tokenizer object, but kept just in case).
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('data/processed/word2idx.pickle', 'wb') as pickle_file:
    pickle.dump(word_index, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# Save the fitted tokenizer object so the same word -> index conversion can be
# applied at prediction time.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('tokenizer.pickle', 'wb') as pickle_file:
    pickle.dump(tokenizer, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [43]:
# Write every prepared array to data/processed as a .npy file
# (there is no y_test: the test split has no labels in this notebook).
processed_arrays = (
    ('data/processed/X_train', X_train),
    ('data/processed/y_train', y_train),
    ('data/processed/X_val', X_val),
    ('data/processed/y_val', y_val),
    ('data/processed/X_test', X_test),
)
for target_path, array in processed_arrays:
    np.save(target_path, array)