### Text generation with LSTM networks 

#### Importing libraries

In [0]:
import string,re
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

# mount the google drive to your Colab session
from google.colab import drive
dir="/content/gdrive/My Drive/Colab Notebooks/EIP2/"
#drive.mount('/content/gdrive')
!ls

# define the checkpoint
filepath = dir + "Weights/BigLSTM-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')

gdrive	sample_data


#### Reading the text file

In [0]:
filename = "wonderland.txt"
# Read the whole book at once. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open(dir + filename) as text_file:
    raw_text = text_file.read()
# Lower-case everything so upper- and lower-case letters share one symbol.
raw_text = raw_text.lower()
print("Length before cleaning", len(raw_text))

# Distinct characters present in the raw (uncleaned) text, in sorted order.
chars = sorted(list(set(raw_text)))
print(len(chars), chars)

Length before cleaning 144342
43 ['\n', ' ', '!', '"', "'", '(', ')', '*', ',', '-', '.', ':', ';', '?', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [0]:
def clean_text(text, rgx_list):
    """Blank out regex matches, then strip punctuation from the edges of each word.

    Args:
        text: The string to clean.
        rgx_list: Iterable of regex patterns, applied in order; every match is
            replaced by a single space.

    Returns:
        The whitespace-separated tokens of the substituted text, each with
        leading/trailing ``string.punctuation`` characters removed, re-joined
        with single spaces. Punctuation inside a token (e.g. ``don't``) is
        kept; a token made only of punctuation becomes an empty string and is
        still joined.
    """
    cleaned = text
    for pattern in rgx_list:
        cleaned = re.sub(pattern, ' ', cleaned)
    tokens = [token.strip(string.punctuation) for token in cleaned.split()]
    return ' '.join(tokens)

#### Cleaning the text (e.g. punctuation removal)

In [0]:
# clean_text only strips punctuation at word edges, so '--' is replaced with a
# space first; the remaining punctuation is then stripped word by word.
filtered_text = clean_text(raw_text, ['--'])
text_len = len(filtered_text)
print("Length after cleaning",text_len)

# Vocabulary: every distinct character of the cleaned text, in sorted order.
chars = sorted(set(filtered_text))
vocab_len = len(chars)
print("Unique characters in the text", vocab_len, chars)

# Map each character to its position in the sorted vocabulary.
char_to_int = {symbol: code for code, symbol in enumerate(chars)}
# First five entries of the mapping (kept for quick inspection; not printed).
first5pairs = dict(list(char_to_int.items())[:5])
print(char_to_int)

print("Total Characters: ", text_len)
print("Total Vocab: ", vocab_len)

Length after cleaning 135120
Unique characters in the text 29 [' ', "'", '-', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
{' ': 0, "'": 1, '-': 2, 'a': 3, 'b': 4, 'c': 5, 'd': 6, 'e': 7, 'f': 8, 'g': 9, 'h': 10, 'i': 11, 'j': 12, 'k': 13, 'l': 14, 'm': 15, 'n': 16, 'o': 17, 'p': 18, 'q': 19, 'r': 20, 's': 21, 't': 22, 'u': 23, 'v': 24, 'w': 25, 'x': 26, 'y': 27, 'z': 28}
Total Characters:  135120
Total Vocab:  29


#### Split up the cleaned text into sequences of 100 characters

In [0]:
# Slide a fixed-width window over the cleaned text one character at a time.
# Each window (encoded as integer codes) is one input pattern and the character
# immediately after it is the target.
seq_length = 100
window_starts = range(text_len - seq_length)
dataX = [
    [char_to_int[symbol] for symbol in filtered_text[begin:begin + seq_length]]
    for begin in window_starts
]
dataY = [char_to_int[filtered_text[begin + seq_length]] for begin in window_starts]
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  135020


#### Transform the list of input sequences into a form expected by the LSTM network.

In [0]:
# reshape X to be [samples, time steps, features]; each time step carries a
# single feature, the integer code of one character
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
#print(X.shape, X[:1,:10,:])

# normalize: codes run from 0 to vocab_len - 1, so dividing by vocab_len
# scales them into [0, 1)
X = X / float(vocab_len)
print("X after normalize", X[:1,:10,:])
print()

# one hot encode the output variable
# num_classes is passed explicitly: by default to_categorical infers the width
# from max(dataY) + 1, which would be narrower than the vocabulary (and than
# the softmax layer of a saved model) if the highest-coded character never
# occurs as a target.
y = np_utils.to_categorical(dataY, num_classes=vocab_len)
print("y : ",y.shape, y[:1,:])

X after normalize [[[0.17241379]
  [0.34482759]
  [0.10344828]
  [0.62068966]
  [0.75862069]
  [0.24137931]
  [0.68965517]
  [0.        ]
  [0.37931034]
  [0.        ]]]

y :  (135020, 29) [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]]


#### The dropout rate was changed, and no dropout was added to the input layer

In [0]:
def build_model():
  """Assemble the uncompiled two-layer stacked-LSTM character model.

  Layer sizes come from the notebook-level arrays: the input shape is
  ``(X.shape[1], X.shape[2])`` and the softmax output layer has
  ``y.shape[1]`` units.

  Returns:
    A ``Sequential`` model that has not been compiled; callers compile it and
    then either fit it or load saved weights into it.
  """
  time_steps, n_features = X.shape[1], X.shape[2]
  n_classes = y.shape[1]
  return Sequential([
      # First LSTM returns the full sequence so the second LSTM can consume it.
      LSTM(256, input_shape=(time_steps, n_features), return_sequences=True),
      Dropout(0.1),
      LSTM(256),
      # (a Dropout(0.2) after the second LSTM is disabled)
      Dense(n_classes, activation='softmax'),
  ])

#### Built and trained the model for 28 epochs and saved the weight file, but the training log was lost.

In [0]:
epochs=30

# Build the network through build_model() rather than repeating the layer
# stack here, so the trained architecture cannot drift from the one that is
# rebuilt when saved weights are loaded.
model = build_model()
model.compile(loss='categorical_crossentropy', optimizer='adam')

# define the checkpoint: save only when the training loss reaches a new
# minimum; epoch number and loss are embedded in the file name
filepath = dir + "BigLSTM-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
model.fit(X, y, epochs=epochs, batch_size=64, callbacks=[checkpoint])

W0725 02:52:09.124726 140497816528768 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0725 02:52:09.176335 140497816528768 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0725 02:52:09.185511 140497816528768 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0725 02:52:09.576927 140497816528768 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0725 02:52:09.590681 

KeyboardInterrupt: ignored

In [0]:
# Rebuild the architecture and restore the weights saved at epoch 28.
model = build_model()
model.compile(loss='categorical_crossentropy', optimizer='adam')

model.load_weights(dir + "BigLSTM-28-0.8728.hdf5")
# model.summary() prints the table itself and returns None, so passing it to
# print() produced "Loaded model from disk None". Report first, then summarise.
print("Loaded model from disk")
model.summary()

W0725 03:11:12.111645 140451487528832 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0725 03:11:12.141695 140451487528832 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3295: The name tf.log is deprecated. Please use tf.math.log instead.



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 100, 256)          264192    
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 256)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense_1 (Dense)              (None, 29)                7453      
Total params: 796,957
Trainable params: 796,957
Non-trainable params: 0
_________________________________________________________________
Loaded model from disk None


#### Continued training from 29th to 35th epoch

In [0]:
# Fresh checkpoint callback for this run: saves only when the training loss
# reaches a new minimum within the run.
filepath = dir + "BigLSTM-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
# initial_epoch=28 makes Keras number the next epoch 29; training stops after
# epoch 35.
model.fit(
    X,
    y,
    batch_size=64,
    initial_epoch=28,
    epochs=35,
    callbacks=[checkpoint],
)

W0725 03:13:25.210864 140451487528832 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 29/35

Epoch 00029: loss improved from inf to 0.86046, saving model to /content/gdrive/My Drive/Colab Notebooks/EIP2/BigLSTM-29-0.8605.hdf5
Epoch 30/35

Epoch 00030: loss improved from 0.86046 to 0.85182, saving model to /content/gdrive/My Drive/Colab Notebooks/EIP2/BigLSTM-30-0.8518.hdf5
Epoch 31/35

Epoch 00031: loss improved from 0.85182 to 0.83791, saving model to /content/gdrive/My Drive/Colab Notebooks/EIP2/BigLSTM-31-0.8379.hdf5
Epoch 32/35

Epoch 00032: loss improved from 0.83791 to 0.83451, saving model to /content/gdrive/My Drive/Colab Notebooks/EIP2/BigLSTM-32-0.8345.hdf5
Epoch 33/35

Epoch 00033: loss improved from 0.83451 to 0.82383, saving model to /content/gdrive/My Drive/Colab Notebooks/EIP2/BigLSTM-33-0.8238.hdf5
Epoch 34/35

Epoch 00034: loss improved from 0.82383 to 0.81678, saving model to /content/gdrive/My Drive/Colab Notebooks/EIP2/BigLSTM-34-0.8168.hdf5
Epoch 35/35

Epoch 00035: loss improved from 0.81678 to 0.80974, saving model to /content/gdrive/My Driv

<keras.callbacks.History at 0x7fbd1a664908>

#### Continued training from the 36th to the 50th epoch. From the 50th to the 60th epoch the loss changed very little, so the 50th-epoch weights were used.

In [0]:
# Fresh checkpoint callback for this run: saves only when the training loss
# reaches a new minimum within the run.
filepath = dir + "BigLSTM-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
# initial_epoch=35 makes Keras number the next epoch 36; training stops after
# epoch 50.
model.fit(
    X,
    y,
    batch_size=64,
    initial_epoch=35,
    epochs=50,
    callbacks=[checkpoint],
)

Epoch 36/50

Epoch 00036: loss improved from inf to 0.80710, saving model to /content/gdrive/My Drive/Colab Notebooks/EIP2/BigLSTM-36-0.8071.hdf5
Epoch 37/50

Epoch 00037: loss improved from 0.80710 to 0.79912, saving model to /content/gdrive/My Drive/Colab Notebooks/EIP2/BigLSTM-37-0.7991.hdf5
Epoch 38/50

Epoch 00038: loss improved from 0.79912 to 0.79304, saving model to /content/gdrive/My Drive/Colab Notebooks/EIP2/BigLSTM-38-0.7930.hdf5
Epoch 39/50

Epoch 00039: loss improved from 0.79304 to 0.79178, saving model to /content/gdrive/My Drive/Colab Notebooks/EIP2/BigLSTM-39-0.7918.hdf5
Epoch 40/50

Epoch 00040: loss improved from 0.79178 to 0.78230, saving model to /content/gdrive/My Drive/Colab Notebooks/EIP2/BigLSTM-40-0.7823.hdf5
Epoch 41/50

Epoch 00041: loss improved from 0.78230 to 0.77445, saving model to /content/gdrive/My Drive/Colab Notebooks/EIP2/BigLSTM-41-0.7744.hdf5
Epoch 42/50

Epoch 00042: loss did not improve from 0.77445
Epoch 43/50

Epoch 00043: loss improved from

<keras.callbacks.History at 0x7fbd1030dfd0>

In [0]:
# Rebuild the architecture and restore the weights saved at epoch 50.
model = build_model()
model.compile(loss='categorical_crossentropy', optimizer='adam')

model.load_weights(dir + "Weights/BigLSTM-50-0.7455.hdf5")
# model.summary() prints the table itself and returns None, so passing it to
# print() produced "Loaded model from disk None". Report first, then summarise.
print("Loaded model from disk")
model.summary()

W0726 05:45:06.860924 140717963208576 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0726 05:45:06.888891 140717963208576 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0726 05:45:06.892722 140717963208576 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0726 05:45:07.270133 140717963208576 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0726 05:45:07.283771 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100, 256)          264192    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 256)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense_1 (Dense)              (None, 29)                7453      
Total params: 796,957
Trainable params: 796,957
Non-trainable params: 0
_________________________________________________________________
Loaded model from disk None


#### Picking a random input pattern of 100 chars as our seed sequence

In [0]:
# Reverse lookup: integer code -> character, used to decode patterns and predictions.
int_to_char = dict(enumerate(chars))

# pick a random seed
# randint's upper bound is exclusive, so len(dataX) (not len(dataX)-1) is
# needed for the last pattern to be eligible as well.
start = numpy.random.randint(0, len(dataX))
# Take a copy: the seed is extended while generating text, and working on the
# list stored in dataX would corrupt that training pattern.
pattern = list(dataX[start])
print(start, len(pattern))
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

105810 100
Seed:
" out lobsters you know which shall sing oh you sing said the gryphon i've forgotten the words so they "


In [0]:
# Greedy character-by-character generation: feed the current window to the
# model, take the most probable next character, then slide the window forward.
output = ""

# generate characters
for i in range(500):
  # Same preprocessing as the training data: [1, time steps, 1] and scaled by
  # the vocabulary size.
  x = numpy.reshape(pattern, (1, len(pattern), 1))
  x = x / float(vocab_len)
  prediction = model.predict(x, verbose=0)
  index = numpy.argmax(prediction)
  result_char = int_to_char[index]
  output += result_char

  # Slide the window by one character: drop the oldest code and append the
  # prediction. A new list is built rather than appending in place, because
  # `pattern` may still be the very list stored in dataX and appending to it
  # would leave that training pattern one element too long.
  pattern = pattern[1:] + [index]

print("\nOutput:", output)


Output:  began rooething more haspily and drew dlnnent i don't like the look of the court and she was quite silent and looked at alice and she was now and she was now and she was now and she was now and she was now and the shar's pet i dould not could not could not could not could not could not could not could not could not could not could not could not could not could not could not could not could not could not could not could not could not could not could not could not could not could not could not co


Here, the first word was correct and the overall effect is OK. But after some 25-30 words, the output falls into repetitive phrases. Generating fewer characters (e.g. 200) might give a better result.

Due to lack of time, the model could not be trained on padded sentences.