In [None]:
# Source: https://machinelearningmastery.com/develop-word-based-neural-language-models-python-keras/
# Each training sample is a pre-padded prefix of a line (X) and the word that follows it (y):
# X,                                y
# _, _, _, _, _, Jack,              and
# _, _, _, _, Jack, and,            Jill
# _, _, _, Jack, and, Jill,         went
# _, _, Jack, and, Jill, went,      up
# _, Jack, and, Jill, went, up,     the
# Jack, and, Jill, went, up, the,   hill

In [4]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [5]:
# source text: the "Jack and Jill" rhyme, used as the entire training corpus.
# NOTE: every verse ends with an explicit "\n" escape followed by a real line
# break, and the continuation lines carry leading whitespace, so the string
# contains blank lines and indentation between verses.
data = """ Jack and Jill went up the hill\n
		To fetch a pail of water\n
		Jack fell down and broke his crown\n
		And Jill came tumbling after\n """

In [6]:
# fit a word-level tokenizer on the source text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
# vocabulary size: one slot per known word plus one extra
# (presumably for index 0, which word_index does not hand out -- confirm against the Keras docs)
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 22


In [7]:
# build the training sequences line by line: for every line of the text,
# each prefix of two or more word indices becomes one sequence
sequences = [
	encoded[:end]
	for encoded in tokenizer.texts_to_sequences(data.split('\n'))
	for end in range(2, len(encoded) + 1)
]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 21


In [8]:
# pad every sequence at the front ('pre') up to the length of the longest one
max_length = max(map(len, sequences))
sequences = pad_sequences(sequences, padding='pre', maxlen=max_length)
print('Max Sequence Length: %d' % max_length)

Max Sequence Length: 7


In [9]:
# separate each padded sequence into its context (all but the last index)
# and its target (the last index)
sequences = array(sequences)
X = sequences[:, :-1]
# one-hot encode the target word index over the whole vocabulary
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

In [10]:
# define model: embedding -> LSTM -> softmax over the vocabulary
model = Sequential()
# each input holds max_length-1 word indices (the last position of every
# padded sequence is split off as the target)
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(50))
# one output probability per vocabulary index
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the cell output
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 6, 10)             220       
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_1 (Dense)              (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
# configure training: categorical cross-entropy loss, Adam optimizer, accuracy reported as a metric
model.compile(
	optimizer='adam',
	loss='categorical_crossentropy',
	metrics=['accuracy'],
)

In [12]:
# train on all (X, y) pairs for 500 epochs with per-epoch logging (verbose=2)
model.fit(x=X, y=y, verbose=2, epochs=500)

Epoch 1/500
 - 1s - loss: 3.0906 - acc: 0.0476
Epoch 2/500
 - 0s - loss: 3.0890 - acc: 0.1429
Epoch 3/500
 - 0s - loss: 3.0875 - acc: 0.1429
Epoch 4/500
 - 0s - loss: 3.0859 - acc: 0.1905
Epoch 5/500
 - 0s - loss: 3.0843 - acc: 0.1905
Epoch 6/500
 - 0s - loss: 3.0827 - acc: 0.1429
Epoch 7/500
 - 0s - loss: 3.0811 - acc: 0.1429
Epoch 8/500
 - 0s - loss: 3.0794 - acc: 0.1429
Epoch 9/500
 - 0s - loss: 3.0776 - acc: 0.1429
Epoch 10/500
 - 0s - loss: 3.0758 - acc: 0.1429
Epoch 11/500
 - 0s - loss: 3.0739 - acc: 0.1429
Epoch 12/500
 - 0s - loss: 3.0720 - acc: 0.1429
Epoch 13/500
 - 0s - loss: 3.0699 - acc: 0.1429
Epoch 14/500
 - 0s - loss: 3.0678 - acc: 0.1429
Epoch 15/500
 - 0s - loss: 3.0655 - acc: 0.1429
Epoch 16/500
 - 0s - loss: 3.0631 - acc: 0.1429
Epoch 17/500
 - 0s - loss: 3.0606 - acc: 0.1429
Epoch 18/500
 - 0s - loss: 3.0579 - acc: 0.1429
Epoch 19/500
 - 0s - loss: 3.0550 - acc: 0.1429
Epoch 20/500
 - 0s - loss: 3.0520 - acc: 0.1429
Epoch 21/500
 - 0s - loss: 3.0488 - acc: 0.1429
E

Epoch 171/500
 - 0s - loss: 0.6239 - acc: 0.8571
Epoch 172/500
 - 0s - loss: 0.6167 - acc: 0.8571
Epoch 173/500
 - 0s - loss: 0.6097 - acc: 0.8571
Epoch 174/500
 - 0s - loss: 0.6028 - acc: 0.8571
Epoch 175/500
 - 0s - loss: 0.5960 - acc: 0.8571
Epoch 176/500
 - 0s - loss: 0.5893 - acc: 0.8571
Epoch 177/500
 - 0s - loss: 0.5828 - acc: 0.8571
Epoch 178/500
 - 0s - loss: 0.5764 - acc: 0.8571
Epoch 179/500
 - 0s - loss: 0.5701 - acc: 0.8571
Epoch 180/500
 - 0s - loss: 0.5639 - acc: 0.8571
Epoch 181/500
 - 0s - loss: 0.5578 - acc: 0.8571
Epoch 182/500
 - 0s - loss: 0.5518 - acc: 0.8571
Epoch 183/500
 - 0s - loss: 0.5459 - acc: 0.8571
Epoch 184/500
 - 0s - loss: 0.5400 - acc: 0.8571
Epoch 185/500
 - 0s - loss: 0.5343 - acc: 0.8571
Epoch 186/500
 - 0s - loss: 0.5286 - acc: 0.8571
Epoch 187/500
 - 0s - loss: 0.5230 - acc: 0.8571
Epoch 188/500
 - 0s - loss: 0.5175 - acc: 0.8571
Epoch 189/500
 - 0s - loss: 0.5121 - acc: 0.8571
Epoch 190/500
 - 0s - loss: 0.5068 - acc: 0.8571
Epoch 191/500
 - 0s 

Epoch 339/500
 - 0s - loss: 0.1703 - acc: 0.9524
Epoch 340/500
 - 0s - loss: 0.1694 - acc: 0.9524
Epoch 341/500
 - 0s - loss: 0.1686 - acc: 0.9524
Epoch 342/500
 - 0s - loss: 0.1678 - acc: 0.9524
Epoch 343/500
 - 0s - loss: 0.1668 - acc: 0.9524
Epoch 344/500
 - 0s - loss: 0.1661 - acc: 0.9524
Epoch 345/500
 - 0s - loss: 0.1653 - acc: 0.9524
Epoch 346/500
 - 0s - loss: 0.1643 - acc: 0.9524
Epoch 347/500
 - 0s - loss: 0.1635 - acc: 0.9524
Epoch 348/500
 - 0s - loss: 0.1628 - acc: 0.9524
Epoch 349/500
 - 0s - loss: 0.1620 - acc: 0.9524
Epoch 350/500
 - 0s - loss: 0.1611 - acc: 0.9524
Epoch 351/500
 - 0s - loss: 0.1604 - acc: 0.9524
Epoch 352/500
 - 0s - loss: 0.1596 - acc: 0.9524
Epoch 353/500
 - 0s - loss: 0.1587 - acc: 0.9524
Epoch 354/500
 - 0s - loss: 0.1582 - acc: 0.9524
Epoch 355/500
 - 0s - loss: 0.1573 - acc: 0.9524
Epoch 356/500
 - 0s - loss: 0.1564 - acc: 0.9524
Epoch 357/500
 - 0s - loss: 0.1558 - acc: 0.9524
Epoch 358/500
 - 0s - loss: 0.1550 - acc: 0.9524
Epoch 359/500
 - 0s 

<keras.callbacks.History at 0x27f34f70898>

In [13]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
	"""Greedily extend ``seed_text`` with up to ``n_words`` predicted words.

	Args:
		model: trained network whose ``predict`` returns one row of
			per-class scores for each input row (e.g. a softmax over the vocabulary).
		tokenizer: fitted tokenizer providing ``texts_to_sequences`` and ``word_index``.
		max_length: number of word indices the model expects as input; the
			encoded text is pre-padded to this length.
		seed_text: text to start from.
		n_words: maximum number of words to append.

	Returns:
		``seed_text`` followed by the generated words, separated by single spaces.
		Generation stops early if the predicted index has no entry in the vocabulary.
	"""
	# invert the vocabulary once instead of scanning word_index for every generated word
	index_to_word = {index: word for word, index in tokenizer.word_index.items()}
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# pre-pad sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
		# pick the most probable class; predict_classes() is not available in
		# current Keras releases, and argmax over predict() selects the same index
		probabilities = model.predict(encoded, verbose=0)
		yhat = int(probabilities.argmax(axis=-1)[0])
		# map predicted word index to word
		out_word = index_to_word.get(yhat)
		if out_word is None:
			# no word for this index: appending an empty word would only add
			# trailing spaces and leave the encoded input unchanged, so stop here
			break
		# append to input
		in_text += ' ' + out_word
	return in_text

In [14]:
# evaluate model: print a four-word continuation for each seed word
for seed_word in ('Jack', 'Jill'):
	print(generate_seq(model, tokenizer, max_length-1, seed_word, 4))

Jack fell down and broke
Jill jill came tumbling after
