In [3]:
import numpy as np
import gensim
import string

from keras.callbacks import LambdaCallback
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Activation
from keras.models import Sequential
from keras.utils.data_utils import get_file


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
path="/home/junjie/.keras/datasets/arxiv_abstracts.txt"

In [5]:
print('\nPreparing the sentences...')
max_sentence_len = 40
with open(path) as file_:
  docs = file_.readlines()
sentences = [[word for word in doc.lower().translate( string.punctuation).split()[:max_sentence_len]] for doc in docs]
print('Num sentences:', len(sentences))


Preparing the sentences...
Num sentences: 7200


In [6]:
sentences[0]

['in',
 'science',
 'and',
 'engineering,',
 'intelligent',
 'processing',
 'of',
 'complex',
 'signals',
 'such',
 'as',
 'images,',
 'sound',
 'or',
 'language',
 'is',
 'often',
 'performed',
 'by',
 'a',
 'parameterized',
 'hierarchy',
 'of',
 'nonlinear',
 'processing',
 'layers,',
 'sometimes',
 'biologically',
 'inspired.',
 'hierarchical',
 'systems',
 '(or,',
 'more',
 'generally,',
 'nested',
 'systems)',
 'offer',
 'a',
 'way',
 'to']

In [8]:
print('\nTraining word2vec...')
word_model = gensim.models.Word2Vec(sentences, size=100, min_count=1, window=5, iter=100)
pretrained_weights = word_model.wv.syn0
vocab_size, emdedding_size = pretrained_weights.shape
print('Result embedding shape:', pretrained_weights.shape)
print('Checking similar words:')
for word in ['model', 'network', 'train', 'learn']:
  most_similar = ', '.join('%s (%.2f)' % (similar, dist) for similar, dist in word_model.most_similar(word)[:8])
  print('  %s -> %s' % (word, most_similar))


Training word2vec...
Result embedding shape: (1351, 100)
Checking similar words:
  model -> $l_p$ (0.40), technique (0.34), architecture. (0.33), trains (0.32), continuous (0.32), 2012) (0.31), framework (0.31), studying (0.31)
  network -> networks (0.37), connected (0.25), constrained (0.24), represent (0.22), lies (0.22), research (0.22), architecture (0.22), subclass (0.21)
  train -> based (0.36), eigendecompositions (0.34), average (0.34), construct (0.33), directly (0.33), represent (0.33), classical (0.33), tend (0.32)
  learn -> lower (0.40), automatically (0.39), adopted (0.36), relevant (0.36), remain (0.36), effort (0.36), upper (0.35), tend (0.35)


  This is separate from the ipykernel package so we can avoid doing imports until
  


In [9]:
def word2idx(word):
  return word_model.wv.vocab[word].index
def idx2word(idx):
  return word_model.wv.index2word[idx]

In [10]:
print('\nPreparing the data for LSTM...')
train_x = np.zeros([len(sentences), max_sentence_len], dtype=np.int32)
train_y = np.zeros([len(sentences)], dtype=np.int32)


Preparing the data for LSTM...


In [11]:
for i, sentence in enumerate(sentences):
  for t, word in enumerate(sentence[:-1]):
    train_x[i, t] = word2idx(word)
  train_y[i] = word2idx(sentence[-1])

In [12]:
for i, sentence in enumerate(sentences):
  for t, word in enumerate(sentence[:-1]):
    train_x[i, t] = word2idx(word)
  train_y[i] = word2idx(sentence[-1])
print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)

train_x shape: (7200, 40)
train_y shape: (7200,)


In [13]:
for i, sentence in enumerate(train_x[1]):
    print(idx2word(sentence))

poor
(even
random)
starting
points
for
learning/training/optimization
are
common
in
machine
learning.
in
many
settings,
the
method
of
robbins
and
monro
(online
stochastic
gradient
descent)
is
known
to
be
optimal
for
good
starting
points,
but
may
not
be
optimal
the


In [14]:
train_y[0]

3

In [15]:
idx2word(0)

'the'

In [16]:
sentences[1]

['poor',
 '(even',
 'random)',
 'starting',
 'points',
 'for',
 'learning/training/optimization',
 'are',
 'common',
 'in',
 'machine',
 'learning.',
 'in',
 'many',
 'settings,',
 'the',
 'method',
 'of',
 'robbins',
 'and',
 'monro',
 '(online',
 'stochastic',
 'gradient',
 'descent)',
 'is',
 'known',
 'to',
 'be',
 'optimal',
 'for',
 'good',
 'starting',
 'points,',
 'but',
 'may',
 'not',
 'be',
 'optimal',
 'for']

In [17]:
train_x[0]

array([   4,  844,    5, 1021,  860,  163,    1,  107,  141,   26,   19,
        192, 1140,   46,  412,   11,  127,  281,   17,    2,  869,  285,
          1,  224,  163,  402, 1219,  495,  736,  178, 1291,  827,   53,
       1065, 1249, 1100,  823,    2,  282,    0], dtype=int32)

In [19]:
train_x.shape

(7200, 40)

In [8]:
print('\nTraining LSTM...')
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[pretrained_weights]))
model.add(LSTM(units=emdedding_size))
model.add(Dense(units=vocab_size))
model.add(Activation('softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')


Training LSTM...


In [9]:
def sample(preds, temperature=1.0):
  if temperature <= 0:
    return np.argmax(preds)
  preds = np.asarray(preds).astype('float64')
  preds = np.log(preds) / temperature
  exp_preds = np.exp(preds)
  preds = exp_preds / np.sum(exp_preds)
  probas = np.random.multinomial(1, preds, 1)
  return np.argmax(probas)

def generate_next(text, num_generated=10):
  word_idxs = [word2idx(word) for word in text.lower().split()]
  for i in range(num_generated):
    prediction = model.predict(x=np.array(word_idxs))
    idx = sample(prediction[-1], temperature=0.7)
    word_idxs.append(idx)
  return ' '.join(idx2word(idx) for idx in word_idxs)

def on_epoch_end(epoch, _):
  print('\nGenerating text after epoch: %d' % epoch)
  texts = [
    'deep convolutional',
    'simple and effective',
    'a nonconvex',
    'a',
  ]
  for text in texts:
    sample = generate_next(text)
    print('%s... -> %s' % (text, sample))


In [10]:
model.fit(train_x, train_y,
          batch_size=128,
          epochs=20,
          callbacks=[LambdaCallback(on_epoch_end=on_epoch_end)])

Epoch 1/20

Generating text after epoch: 0
deep convolutional... -> deep convolutional accurately adaptation representation visual descent) (asr). residual lower-bounded about customer
simple and effective... -> simple and effective defined presented. non-linear, consider (sgd); networks, rates low-dimensional formulation contrast
a nonconvex... -> a nonconvex depth-dependency paper, contrast process tasks. child remarkable presents not see
a... -> a on examples clarifies supporting them. leveraging "maxout" directly (mdrnns) spectrogram
Epoch 2/20

Generating text after epoch: 1
deep convolutional... -> deep convolutional whereas are: automatic krizhevsky analysis outperform believed promising success gprop,
simple and effective... -> simple and effective starting plays loss depth, position, limiting long machines operate (mtf).
a nonconvex... -> a nonconvex (rnns) appropriate evolving nearly periodically markov available objective consider able
a... -> a decay unknown with extract te


Generating text after epoch: 14
deep convolutional... -> deep convolutional contrast, networks.+ vector providing computation variation spurious combine backpropagation problem,
simple and effective... -> simple and effective technique, optima humans there domain deeper edge loss also (possibly
a nonconvex... -> a nonconvex estimated hierarchical unit visual minima parameters, neural networks.+ architecture, variants
a... -> a normalized versatility model prone basic rise rise rounding building variant
Epoch 16/20

Generating text after epoch: 15
deep convolutional... -> deep convolutional open (rir): randomly same lower-bounded random notion outperform discrete neuron
simple and effective... -> simple and effective domain capacity, study (iii) research supporting deep labeled sequentially or
a nonconvex... -> a nonconvex labels incorporating (rnn) arbitrary three redundancy characterization versatility computationally effort
a... -> a as (e.g rectified random) requires effective desi

<keras.callbacks.History at 0x7f2b2c0cda20>