[machinelearningmastery](http://machinelearningmastery.com/understanding-stateful-lstm-recurrent-neural-networks-python-keras/)

In [1]:
# Naive LSTM to learn one-char to one-char mapping
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.utils import np_utils

Using Theano backend.


In [2]:
# seed NumPy's global random generator so repeated runs are reproducible
numpy.random.seed(7)
# raw dataset: the uppercase letters A..Z, generated from their code points
alphabet = "".join(chr(code) for code in range(ord("A"), ord("Z") + 1))

# Naive LSTM for Learning Two-Char to One-Char Mapping

In [3]:
# lookup tables between each letter and its position (0-25) in the alphabet
char_to_int = {letter: position for position, letter in enumerate(alphabet)}
# enumerate() already yields (position, letter) pairs, which dict() accepts directly
int_to_char = dict(enumerate(alphabet))

In [4]:
# build (input window -> next letter) training pairs, encoded as integers
seq_length = 2
dataX, dataY = [], []
# slide a window of seq_length letters over the alphabet; the letter that
# follows each window is the target
for i in range(len(alphabet) - seq_length):
    seq_in, seq_out = alphabet[i:i + seq_length], alphabet[i + seq_length]
    dataX.append([char_to_int[letter] for letter in seq_in])
    dataY.append(char_to_int[seq_out])
    print(seq_in, '->', seq_out)

AB -> C
BC -> D
CD -> E
DE -> F
EF -> G
FG -> H
GH -> I
HI -> J
IJ -> K
JK -> L
KL -> M
LM -> N
MN -> O
NO -> P
OP -> Q
PQ -> R
QR -> S
RS -> T
ST -> U
TU -> V
UV -> W
VW -> X
WX -> Y
XY -> Z


In [5]:
# arrange the windows as [samples, time steps, features]:
# each letter in a window is its own time step carrying a single feature
X = numpy.array(dataX).reshape(len(dataX), seq_length, 1)

In [6]:
# inspect the integer-encoded input windows (one inner list per training sample)
dataX

[[0, 1],
 [1, 2],
 [2, 3],
 [3, 4],
 [4, 5],
 [5, 6],
 [6, 7],
 [7, 8],
 [8, 9],
 [9, 10],
 [10, 11],
 [11, 12],
 [12, 13],
 [13, 14],
 [14, 15],
 [15, 16],
 [16, 17],
 [17, 18],
 [18, 19],
 [19, 20],
 [20, 21],
 [21, 22],
 [22, 23],
 [23, 24]]

In [7]:
# inspect X after the reshape to [samples, time steps, features]
X

array([[[ 0],
        [ 1]],

       [[ 1],
        [ 2]],

       [[ 2],
        [ 3]],

       [[ 3],
        [ 4]],

       [[ 4],
        [ 5]],

       [[ 5],
        [ 6]],

       [[ 6],
        [ 7]],

       [[ 7],
        [ 8]],

       [[ 8],
        [ 9]],

       [[ 9],
        [10]],

       [[10],
        [11]],

       [[11],
        [12]],

       [[12],
        [13]],

       [[13],
        [14]],

       [[14],
        [15]],

       [[15],
        [16]],

       [[16],
        [17]],

       [[17],
        [18]],

       [[18],
        [19]],

       [[19],
        [20]],

       [[20],
        [21]],

       [[21],
        [22]],

       [[22],
        [23]],

       [[23],
        [24]]])

In [8]:
# compare the nested-list shape with the 3-D array shape the LSTM expects
print('Shape dataX: ' + str(numpy.asarray(dataX).shape))
print('Shape X: ' + str(X.shape))

Shape dataX: (24, 2)
Shape X: (24, 2, 1)


In [9]:
# normalize the integer codes into the range [0, 1)
# X is rebuilt from dataX rather than divided in place, so re-running this
# cell cannot scale the inputs a second time.
X = numpy.reshape(dataX, (len(dataX), seq_length, 1)) / float(len(alphabet))

In [10]:
# one hot encode the output variable
# The class count is pinned to the alphabet size so that column i always
# corresponds to int_to_char[i], instead of being inferred from the largest
# value that happens to appear in dataY.
y = np_utils.to_categorical(dataY, num_classes=len(alphabet))

In [11]:
# inspect the one-hot encoded targets (one row per sample)
y

array([[ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0

In [18]:
# build the network: one LSTM layer feeding a softmax over the output classes
model = Sequential([
    # input_shape is (time steps, features), i.e. X's shape without the sample axis
    LSTM(32, input_shape=X.shape[1:]),
    # one output unit per one-hot column of y
    Dense(units=y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# train one sample at a time (batch_size=1), logging one line per epoch
model.fit(X, y, epochs=500, batch_size=1, verbose=2)

Epoch 1/500
0s - loss: 3.2699 - acc: 0.0417
Epoch 2/500
0s - loss: 3.2587 - acc: 0.0000e+00
Epoch 3/500
0s - loss: 3.2532 - acc: 0.0417
Epoch 4/500
0s - loss: 3.2481 - acc: 0.0417
Epoch 5/500
0s - loss: 3.2430 - acc: 0.0417
Epoch 6/500
0s - loss: 3.2376 - acc: 0.0417
Epoch 7/500
0s - loss: 3.2321 - acc: 0.0417
Epoch 8/500
0s - loss: 3.2264 - acc: 0.0417
Epoch 9/500
0s - loss: 3.2204 - acc: 0.0417
Epoch 10/500
0s - loss: 3.2143 - acc: 0.0417
Epoch 11/500
0s - loss: 3.2072 - acc: 0.0417
Epoch 12/500
0s - loss: 3.2006 - acc: 0.0417
Epoch 13/500
0s - loss: 3.1908 - acc: 0.0417
Epoch 14/500
0s - loss: 3.1822 - acc: 0.0417
Epoch 15/500
0s - loss: 3.1722 - acc: 0.0417
Epoch 16/500
0s - loss: 3.1622 - acc: 0.0417
Epoch 17/500
0s - loss: 3.1522 - acc: 0.0417
Epoch 18/500
0s - loss: 3.1385 - acc: 0.0417
Epoch 19/500
0s - loss: 3.1280 - acc: 0.0417
Epoch 20/500
0s - loss: 3.1151 - acc: 0.0417
Epoch 21/500
0s - loss: 3.1026 - acc: 0.0417
Epoch 22/500
0s - loss: 3.0885 - acc: 0.0417
Epoch 23/500
0s

<keras.callbacks.History at 0x1167cd898>

In [19]:
# score the fitted model on the same samples it was trained on
# (a check of how well it fit, not a measure of generalization)
scores = model.evaluate(X, y, verbose=0)
# scores[1] is the value reported as a percentage below
print("Model Accuracy: %.2f%%" % (100 * scores[1],))

Model Accuracy: 100.00%


In [21]:
# run every training window back through the model and decode its prediction
for pattern in dataX:
    # shape the window as a single sample and scale it like the training inputs
    x = numpy.array(pattern).reshape(1, len(pattern), 1) / float(len(alphabet))
    prediction = model.predict(x, verbose=0)
    # the position of the largest predicted value maps back to a letter
    index = prediction.argmax()
    result = int_to_char[index]
    seq_in = [int_to_char[code] for code in pattern]
    print(seq_in, "->", result)

['A', 'B'] -> C
['B', 'C'] -> D
['C', 'D'] -> E
['D', 'E'] -> F
['E', 'F'] -> G
['F', 'G'] -> H
['G', 'H'] -> I
['H', 'I'] -> J
['I', 'J'] -> K
['J', 'K'] -> L
['K', 'L'] -> M
['L', 'M'] -> N
['M', 'N'] -> O
['N', 'O'] -> P
['O', 'P'] -> Q
['P', 'Q'] -> R
['Q', 'R'] -> S
['R', 'S'] -> T
['S', 'T'] -> U
['T', 'U'] -> V
['U', 'V'] -> W
['V', 'W'] -> X
['W', 'X'] -> Y
['X', 'Y'] -> Z


# Naive LSTM for a Three-Char Feature Window to One-Char Mapping

In [25]:
# build (three-letter window -> next letter) training pairs as integer codes
seq_length = 3
dataX, dataY = [], []
# every window of seq_length consecutive letters predicts the letter after it
for i in range(len(alphabet) - seq_length):
    seq_in, seq_out = alphabet[i:i + seq_length], alphabet[i + seq_length]
    dataX.append([char_to_int[letter] for letter in seq_in])
    dataY.append(char_to_int[seq_out])
    print(seq_in, '->', seq_out)

ABC -> D
BCD -> E
CDE -> F
DEF -> G
EFG -> H
FGH -> I
GHI -> J
HIJ -> K
IJK -> L
JKL -> M
KLM -> N
LMN -> O
MNO -> P
NOP -> Q
OPQ -> R
PQR -> S
QRS -> T
RST -> U
STU -> V
TUV -> W
UVW -> X
VWX -> Y
WXY -> Z


In [26]:
# arrange the windows as [samples, time steps, features]:
# here each window is ONE time step whose seq_length letters are its features
X = numpy.array(dataX).reshape(len(dataX), 1, seq_length)

In [31]:
# inspect the integer-encoded three-letter input windows
dataX

[[0, 1, 2],
 [1, 2, 3],
 [2, 3, 4],
 [3, 4, 5],
 [4, 5, 6],
 [5, 6, 7],
 [6, 7, 8],
 [7, 8, 9],
 [8, 9, 10],
 [9, 10, 11],
 [10, 11, 12],
 [11, 12, 13],
 [12, 13, 14],
 [13, 14, 15],
 [14, 15, 16],
 [15, 16, 17],
 [16, 17, 18],
 [17, 18, 19],
 [18, 19, 20],
 [19, 20, 21],
 [20, 21, 22],
 [21, 22, 23],
 [22, 23, 24]]

In [27]:
# inspect X after the reshape: one time step per sample, seq_length features
X

array([[[ 0,  1,  2]],

       [[ 1,  2,  3]],

       [[ 2,  3,  4]],

       [[ 3,  4,  5]],

       [[ 4,  5,  6]],

       [[ 5,  6,  7]],

       [[ 6,  7,  8]],

       [[ 7,  8,  9]],

       [[ 8,  9, 10]],

       [[ 9, 10, 11]],

       [[10, 11, 12]],

       [[11, 12, 13]],

       [[12, 13, 14]],

       [[13, 14, 15]],

       [[14, 15, 16]],

       [[15, 16, 17]],

       [[16, 17, 18]],

       [[17, 18, 19]],

       [[18, 19, 20]],

       [[19, 20, 21]],

       [[20, 21, 22]],

       [[21, 22, 23]],

       [[22, 23, 24]]])

In [32]:
# normalize the integer codes into the range [0, 1)
# X is rebuilt from dataX rather than divided in place, so re-running this
# cell cannot scale the inputs a second time. The (samples, 1, seq_length)
# layout keeps each window as a single time step with seq_length features.
X = numpy.reshape(dataX, (len(dataX), 1, seq_length)) / float(len(alphabet))
# one hot encode the output variable
# The class count is pinned to the alphabet size so that column i always
# corresponds to int_to_char[i], instead of being inferred from the largest
# value that happens to appear in dataY.
y = np_utils.to_categorical(dataY, num_classes=len(alphabet))

In [33]:
# build the network: one LSTM layer feeding a softmax over the output classes
model = Sequential([
    # input_shape is (time steps, features), i.e. X's shape without the sample axis
    LSTM(32, input_shape=X.shape[1:]),
    # one output unit per one-hot column of y
    Dense(units=y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# train one sample at a time (batch_size=1), logging one line per epoch
model.fit(X, y, epochs=500, batch_size=1, verbose=2)

Epoch 1/500
0s - loss: 3.2678 - acc: 0.0435
Epoch 2/500
0s - loss: 3.2555 - acc: 0.0435
Epoch 3/500
0s - loss: 3.2490 - acc: 0.0435
Epoch 4/500
0s - loss: 3.2424 - acc: 0.0435
Epoch 5/500
0s - loss: 3.2359 - acc: 0.0435
Epoch 6/500
0s - loss: 3.2284 - acc: 0.0435
Epoch 7/500
0s - loss: 3.2214 - acc: 0.0435
Epoch 8/500
0s - loss: 3.2146 - acc: 0.0435
Epoch 9/500
0s - loss: 3.2062 - acc: 0.0435
Epoch 10/500
0s - loss: 3.1978 - acc: 0.0435
Epoch 11/500
0s - loss: 3.1896 - acc: 0.0435
Epoch 12/500
0s - loss: 3.1799 - acc: 0.0435
Epoch 13/500
0s - loss: 3.1706 - acc: 0.0000e+00
Epoch 14/500
0s - loss: 3.1611 - acc: 0.0435
Epoch 15/500
0s - loss: 3.1504 - acc: 0.0000e+00
Epoch 16/500
0s - loss: 3.1403 - acc: 0.0435
Epoch 17/500
0s - loss: 3.1308 - acc: 0.0000e+00
Epoch 18/500
0s - loss: 3.1218 - acc: 0.0870
Epoch 19/500
0s - loss: 3.1112 - acc: 0.0435
Epoch 20/500
0s - loss: 3.1020 - acc: 0.0435
Epoch 21/500
0s - loss: 3.0937 - acc: 0.0435
Epoch 22/500
0s - loss: 3.0850 - acc: 0.0435
Epoch 2

<keras.callbacks.History at 0x119e10438>

In [34]:
# score the fitted model on the same samples it was trained on
# (a check of how well it fit, not a measure of generalization)
scores = model.evaluate(X, y, verbose=0)
# scores[1] is the value reported as a percentage below
print("Model Accuracy: %.2f%%" % (100 * scores[1],))

Model Accuracy: 86.96%


In [36]:
# run every training window back through the model and decode its prediction
for pattern in dataX:
    # one sample, one time step, len(pattern) features; scaled like the training inputs
    x = numpy.array(pattern).reshape(1, 1, len(pattern)) / float(len(alphabet))
    prediction = model.predict(x, verbose=0)
    # the position of the largest predicted value maps back to a letter
    index = prediction.argmax()
    result = int_to_char[index]
    seq_in = [int_to_char[code] for code in pattern]
    print(seq_in, "->", result)

['A', 'B', 'C'] -> D
['B', 'C', 'D'] -> E
['C', 'D', 'E'] -> F
['D', 'E', 'F'] -> G
['E', 'F', 'G'] -> H
['F', 'G', 'H'] -> I
['G', 'H', 'I'] -> J
['H', 'I', 'J'] -> K
['I', 'J', 'K'] -> L
['J', 'K', 'L'] -> M
['K', 'L', 'M'] -> N
['L', 'M', 'N'] -> O
['M', 'N', 'O'] -> P
['N', 'O', 'P'] -> Q
['O', 'P', 'Q'] -> R
['P', 'Q', 'R'] -> S
['Q', 'R', 'S'] -> T
['R', 'S', 'T'] -> U
['S', 'T', 'U'] -> V
['T', 'U', 'V'] -> X
['U', 'V', 'W'] -> Z
['V', 'W', 'X'] -> Z
['W', 'X', 'Y'] -> Z


# Naive LSTM for a Three-Char Time Step Window to One-Char Mapping

In [38]:
# build (three-letter window -> next letter) training pairs as integer codes
seq_length = 3
dataX, dataY = [], []
# every window of seq_length consecutive letters predicts the letter after it
for i in range(len(alphabet) - seq_length):
    seq_in, seq_out = alphabet[i:i + seq_length], alphabet[i + seq_length]
    dataX.append([char_to_int[letter] for letter in seq_in])
    dataY.append(char_to_int[seq_out])
    print(seq_in, '->', seq_out)

ABC -> D
BCD -> E
CDE -> F
DEF -> G
EFG -> H
FGH -> I
GHI -> J
HIJ -> K
IJK -> L
JKL -> M
KLM -> N
LMN -> O
MNO -> P
NOP -> Q
OPQ -> R
PQR -> S
QRS -> T
RST -> U
STU -> V
TUV -> W
UVW -> X
VWX -> Y
WXY -> Z


In [39]:
# reshape X to be [samples, time steps, features]:
# each letter of a window is its own time step carrying a single feature
X = numpy.reshape(dataX, (len(dataX), seq_length, 1))
# normalize the integer codes into the range [0, 1)
X = X / float(len(alphabet))
# one hot encode the output variable
# The class count is pinned to the alphabet size so that column i always
# corresponds to int_to_char[i], instead of being inferred from the largest
# value that happens to appear in dataY.
y = np_utils.to_categorical(dataY, num_classes=len(alphabet))

In [40]:
# build the network: one LSTM layer feeding a softmax over the output classes
model = Sequential([
    # input_shape is (time steps, features), i.e. X's shape without the sample axis
    LSTM(32, input_shape=X.shape[1:]),
    # one output unit per one-hot column of y
    Dense(units=y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# train one sample at a time (batch_size=1), logging one line per epoch
model.fit(X, y, epochs=500, batch_size=1, verbose=2)

Epoch 1/500
0s - loss: 3.2654 - acc: 0.0000e+00
Epoch 2/500
0s - loss: 3.2507 - acc: 0.0000e+00
Epoch 3/500
0s - loss: 3.2421 - acc: 0.0000e+00
Epoch 4/500
0s - loss: 3.2342 - acc: 0.0435
Epoch 5/500
0s - loss: 3.2264 - acc: 0.0000e+00
Epoch 6/500
0s - loss: 3.2158 - acc: 0.0435
Epoch 7/500
0s - loss: 3.2062 - acc: 0.0000e+00
Epoch 8/500
0s - loss: 3.1949 - acc: 0.0435
Epoch 9/500
0s - loss: 3.1824 - acc: 0.0000e+00
Epoch 10/500
0s - loss: 3.1698 - acc: 0.0000e+00
Epoch 11/500
0s - loss: 3.1549 - acc: 0.0435
Epoch 12/500
0s - loss: 3.1372 - acc: 0.0435
Epoch 13/500
0s - loss: 3.1222 - acc: 0.0435
Epoch 14/500
0s - loss: 3.1031 - acc: 0.0435
Epoch 15/500
0s - loss: 3.0874 - acc: 0.0435
Epoch 16/500
0s - loss: 3.0698 - acc: 0.0435
Epoch 17/500
0s - loss: 3.0549 - acc: 0.0435
Epoch 18/500
0s - loss: 3.0372 - acc: 0.0435
Epoch 19/500
0s - loss: 3.0229 - acc: 0.0435
Epoch 20/500
0s - loss: 3.0051 - acc: 0.0435
Epoch 21/500
0s - loss: 2.9858 - acc: 0.0435
Epoch 22/500
0s - loss: 2.9687 - acc

<keras.callbacks.History at 0x11c71c4e0>

In [41]:
# score the fitted model on the same samples it was trained on
# (a check of how well it fit, not a measure of generalization)
scores = model.evaluate(X, y, verbose=0)
# scores[1] is the value reported as a percentage below
print("Model Accuracy: %.2f%%" % (100 * scores[1],))

Model Accuracy: 100.00%


In [43]:
# run every training window back through the model and decode its prediction
for pattern in dataX:
    # one sample, len(pattern) time steps, one feature; scaled like the training inputs
    x = numpy.array(pattern).reshape(1, len(pattern), 1) / float(len(alphabet))
    prediction = model.predict(x, verbose=0)
    # the position of the largest predicted value maps back to a letter
    index = prediction.argmax()
    result = int_to_char[index]
    seq_in = [int_to_char[code] for code in pattern]
    print(seq_in, "->", result)

['A', 'B', 'C'] -> D
['B', 'C', 'D'] -> E
['C', 'D', 'E'] -> F
['D', 'E', 'F'] -> G
['E', 'F', 'G'] -> H
['F', 'G', 'H'] -> I
['G', 'H', 'I'] -> J
['H', 'I', 'J'] -> K
['I', 'J', 'K'] -> L
['J', 'K', 'L'] -> M
['K', 'L', 'M'] -> N
['L', 'M', 'N'] -> O
['M', 'N', 'O'] -> P
['N', 'O', 'P'] -> Q
['O', 'P', 'Q'] -> R
['P', 'Q', 'R'] -> S
['Q', 'R', 'S'] -> T
['R', 'S', 'T'] -> U
['S', 'T', 'U'] -> V
['T', 'U', 'V'] -> W
['U', 'V', 'W'] -> X
['V', 'W', 'X'] -> Y
['W', 'X', 'Y'] -> Z
