In [1]:
import os
import pickle
import numpy as np

In [2]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
# Limit this process to 20% of each GPU's memory
# (presumably so the device can be shared with other jobs — confirm).
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.2
# Install a session built from that config as the session Keras' TensorFlow backend uses.
set_session(tf.Session(config=config))

Using TensorFlow backend.


In [3]:
# Parse the GloVe text file into a {word: float32 vector} lookup.
# Each line is "<word> <v1> <v2> ...", separated by whitespace.
embeddings_index = {}
# Be explicit about UTF-8 so parsing does not depend on the platform's
# default encoding (the vocabulary may contain non-ASCII tokens).
with open('glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [4]:
# Load the pickled node metadata from nodeMeta.pkl.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open this file if it comes from a trusted source.
with open('nodeMeta.pkl', 'rb') as f:
    nodeMeta = pickle.load(f)

In [5]:
# Gather the "abstract" entry of every node, in nodeMeta's iteration order.
corpus = [nodeMeta[node_id]["abstract"] for node_id in nodeMeta]
print('Found %s texts.' % len(corpus))

Found 17500 texts.


In [6]:
# Peek at the first ten items of the fourth abstract.
corpus[3][:10]

['perform',
 'sewing',
 'two',
 'dual',
 'ramond',
 'reggeon',
 'vertices',
 'derive',
 'algorithm',
 'means']

In [7]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
# Fit a Keras Tokenizer on the corpus; its word_index is used later to map words to integer ids.
t = Tokenizer()
t.fit_on_texts(corpus)
# Report how many distinct words the tokenizer counted.
print (len(t.word_counts), 'words')

19801 words


In [8]:
# Forward (word -> index) and reverse (index -> word) lookups taken from the tokenizer.
vocab2idx = {word: index for word, index in t.word_index.items()}
idx2vocab = {index: word for word, index in t.word_index.items()}
# Give index 0 an explicit placeholder token in the reverse lookup.
idx2vocab[0] = "<pad>"

In [9]:
# Length statistics of the abstracts: mean, median, min and max item count.
seq_length = list(map(len, corpus))
length_stats = [stat(seq_length) for stat in (np.mean, np.median, np.min, np.max)]
print(*length_stats)

60.5432 55.0 2 237


In [10]:
from keras.preprocessing.sequence import pad_sequences
# Convert every abstract to a list of tokenizer indices (0 for any token
# missing from word_index), then pad/truncate all of them to 35 ids in a
# single pad_sequences call instead of one call per abstract.
# The result is one (len(corpus), 35) int32 array rather than a list of
# (1, 35) arrays; zeros are appended as padding and long abstracts are cut
# at the end ('post' for both).
seqs = [[t.word_index.get(token, 0) for token in doc] for doc in corpus]
data = pad_sequences(seqs, maxlen=35, dtype='int32', padding='post', truncating='post', value=0)

In [11]:
# Build the embedding matrix: row i holds the GloVe vector of the word with
# tokenizer index i, or stays all-zero when that word has no GloVe entry.
# The loop starts at index 0, so the "<pad>" placeholder is looked up (and
# counted as missing when absent from GloVe) along with the real words.
vocab_size = len(t.word_index)
embedding_matrix = np.zeros((vocab_size+1, 100))
cnt = 0
for i in range(vocab_size+1):
    vector = embeddings_index.get(idx2vocab[i])
    if vector is None:
        cnt += 1
    else:
        embedding_matrix[i] = vector
print (cnt, 'vocab has no embedding', vocab_size)

6517 vocab has no embedding 19801


In [12]:
from keras.utils import to_categorical
# Stack the padded sequences into one (n_samples, 35) array. The row count is
# taken from `data` itself rather than hard-coded, so the cell keeps working
# when the corpus size changes; reshape still fails loudly if any row is not
# exactly 35 ids long.
data = np.array(data).reshape(len(data), 35)

In [13]:
def data_generator(data):
    """Endlessly yield shuffled (inputs, one-hot targets) batches for the autoencoder.

    Parameters
    ----------
    data : numpy.ndarray
        Array of integer token ids indexed along its first axis; each batch is
        taken with an integer index array, so `data` must support that.

    Yields
    ------
    (batch, targets) : tuple
        `batch` is a slice of `data` with at most `batch_size` rows and
        `targets` is `to_categorical(batch, num_classes=vocab_size+1)`.

    Raises
    ------
    ValueError
        If `data` is empty (raised on the first `next()`), since no batch
        could ever be produced.

    Notes
    -----
    `batch_size` and `vocab_size` are read from module-level globals while the
    generator runs, so they must be defined before the first batch is requested.
    Every epoch reshuffles the row order and produces exactly
    ceil(len(data) / batch_size) non-empty batches.
    """
    n_samples = len(data)
    if n_samples == 0:
        raise ValueError("data_generator needs at least one sample")

    idx = np.arange(n_samples)
    while True:
        np.random.shuffle(idx)
        # Step through the shuffled indices in strides of batch_size. The last
        # batch may be short but is never empty, even when n_samples is an
        # exact multiple of batch_size.
        for start in range(0, n_samples, batch_size):
            batch = data[idx[start:start + batch_size]]
            yield batch, to_categorical(batch, num_classes=vocab_size+1)

In [14]:
from keras.layers import Input, LSTM, RepeatVector, Embedding, Dense
from keras.models import Model

In [21]:
# Model hyper-parameters.
timesteps = 35     # sequence length; must equal the maxlen used when padding `data`
input_dim = 1      # not referenced in this cell
latent_dim = 128   # number of units in the encoder LSTM

# Derive the embedding shape from the matrix that supplies the weights, so the
# layer cannot drift out of sync with it.
num_tokens, embedding_dim = embedding_matrix.shape

# Frozen embedding layer initialised from embedding_matrix.
embedding_layer = Embedding(num_tokens,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=timesteps,
                            trainable=False)

# Encoder: token ids -> embeddings -> a single latent_dim-sized vector.
inputs = Input(shape=(timesteps,))
emb = embedding_layer(inputs)
encoded = LSTM(latent_dim)(emb)

# Decoder: repeat the latent vector for every timestep, run it through an
# LSTM, and predict a softmax over the vocabulary (plus index 0) at each position.
decoded = RepeatVector(timesteps)(encoded)
decoded = LSTM(100, return_sequences=True)(decoded)
decoded = Dense(vocab_size+1, activation='softmax')(decoded)

# `autoencoder` maps ids to per-position word distributions;
# `encoder` shares the same layers and exposes only the latent vector.
autoencoder = Model(inputs, decoded)
encoder = Model(inputs, encoded)

In [22]:
# Print the layer-by-layer overview of the autoencoder.
autoencoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 35)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 35, 100)           1980200   
_________________________________________________________________
lstm_5 (LSTM)                (None, 128)               117248    
_________________________________________________________________
repeat_vector_3 (RepeatVecto (None, 35, 128)           0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 35, 100)           91600     
_________________________________________________________________
dense_1 (Dense)              (None, 35, 19802)         2000002   
Total params: 4,189,050
Trainable params: 2,208,850
Non-trainable params: 1,980,200
__________________________________________________________

In [23]:
from keras.optimizers import Adam
# Adam with AMSGrad and learning-rate decay.
opt = Adam(lr=0.01, amsgrad=True, decay=0.005)
# Pass the optimizer configured above. Previously the string 'rmsprop' was
# given here, so `opt` and its hyper-parameters were silently ignored.
# NOTE(review): if RMSprop was the deliberate choice, delete `opt` instead.
autoencoder.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['acc'])

In [24]:
# Hold out the last 5% of rows for validation; rows are taken in their
# existing order (nothing is shuffled in this cell).
split = int(len(data)*0.95)
train = data[:split]
val = data[split:]

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
batch_size = 32

# Number of batches needed to see every row once (ceiling division, so the
# final partial batch is included).
train_steps = (len(train) + batch_size - 1) // batch_size
val_steps = (len(val) + batch_size - 1) // batch_size

# Train on the training split only. Feeding all of `data` here would include
# the validation rows, so the val_loss that drives early stopping and
# checkpointing would be measured on samples the model was trained on.
history = autoencoder.fit_generator(data_generator(train),
                                    steps_per_epoch=train_steps,
                                    validation_data=data_generator(val),
                                    validation_steps=val_steps,
                                    epochs=1000,
                                    callbacks=[
                                        # Stop after 15 epochs without val_loss improvement.
                                        EarlyStopping(monitor='val_loss', patience=15),
                                        # Keep only the weights with the best val_loss so far.
                                        ModelCheckpoint("autoencoder.hdf5", monitor='val_loss', save_best_only=True),
                                    ])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500


Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500


Epoch 120/500
Epoch 121/500
Epoch 122/500
Epoch 123/500
Epoch 124/500
Epoch 125/500
Epoch 126/500
Epoch 127/500
Epoch 128/500
Epoch 129/500
Epoch 130/500
Epoch 131/500
Epoch 132/500
Epoch 133/500
Epoch 134/500
Epoch 135/500
Epoch 136/500
Epoch 137/500
Epoch 138/500
Epoch 139/500
Epoch 140/500
Epoch 141/500
Epoch 142/500
Epoch 143/500
Epoch 144/500
Epoch 145/500
Epoch 146/500
Epoch 147/500
Epoch 148/500
Epoch 149/500
Epoch 150/500
Epoch 151/500
Epoch 152/500
Epoch 153/500
Epoch 154/500
Epoch 155/500
Epoch 156/500
Epoch 157/500
Epoch 158/500
Epoch 159/500
Epoch 160/500
Epoch 161/500
Epoch 162/500
Epoch 163/500
Epoch 164/500
Epoch 165/500
Epoch 166/500
Epoch 167/500
Epoch 168/500

In [None]:
# Write both models to disk: the full autoencoder first, then the encoder.
for _model, _path in ((autoencoder, 'autoencoder.h5'), (encoder, 'encoder.h5')):
    _model.save(_path)

In [None]:
# Persist the per-epoch metrics recorded during training.
# Only the plain `history.history` dict is pickled: the History callback
# object itself keeps a reference to the model, which generally cannot be
# pickled. Load it back with pickle.load to get a {metric_name: [values]} dict.
with open('history.pkl', 'wb') as f:
    pickle.dump(history.history, f)