In [21]:
#importing libraries
import tensorflow
import numpy
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense,Dropout,LSTM
from keras.utils import  np_utils
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# Load the training corpus. The context manager closes the file handle as soon
# as the text has been read instead of leaving it open for the kernel's lifetime.
with open("sample.txt") as corpus_file:
  file=corpus_file.read()

In [23]:
# Display the raw text that was read from sample.txt.
file



In [24]:
#tokenization
def tokenize_words(input):
  """Normalise raw text into one continuous lower-case string.

  The text is lower-cased, split into runs of word characters, and every
  token that is an English stop word is dropped. The surviving tokens are
  concatenated with no separator, so the result contains no whitespace.

  Args:
    input: the raw text (str). The name shadows the builtin ``input``; it is
      kept so existing keyword callers continue to work.

  Returns:
    A str made of the remaining tokens joined back to back. An empty input
    yields an empty string.
  """
  input=input.lower()
  tokenizer=RegexpTokenizer(r'\w+')
  tokens=tokenizer.tokenize(input)
  # Build the stop-word set once, rather than calling stopwords.words() for
  # every token; set membership is also a constant-time lookup.
  stop_words=set(stopwords.words('english'))
  filtered=(token for token in tokens if token not in stop_words)
  return "".join(filtered)
# Clean the whole corpus; every later cell works on this single string.
processed_inputs=tokenize_words(file)

In [25]:
# Print the cleaned corpus produced by tokenize_words.
print(processed_inputs)



In [26]:
# Character vocabulary: the distinct characters of the cleaned corpus in sorted
# order, plus a lookup from each character to its position in that order.
chars=sorted(set(processed_inputs))
chars_to_num={ch:idx for idx,ch in enumerate(chars)}

In [27]:
# Show the vocabulary and its character -> id mapping, one per line.
print(chars,chars_to_num,sep="\n")

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25}


In [28]:
# Sanity check: report the corpus length in characters and the vocabulary size.
input_len,vocab_len=len(processed_inputs),len(chars)
print('Total no.of characters:',input_len)
print('Total vocab:',vocab_len)

Total no.of characters: 18645
Total vocab: 26


In [29]:
# Sequence length: number of characters in each input window.
seq_len=100
# Containers for the encoded input windows and their next-character labels.
x_data,y_data=[],[]

In [30]:
# Build the training windows: each sample is seq_len consecutive character ids
# and its label is the id of the character that immediately follows the window.
# The lists are reset here so that re-running this cell rebuilds the data
# instead of appending a second copy of every pattern.
x_data=[]
y_data=[]
# Encode the corpus once; every window is then a plain slice of this list.
encoded_inputs=[chars_to_num[char] for char in processed_inputs]
for i in range(0,input_len-seq_len,1):
  x_data.append(encoded_inputs[i:i+seq_len])
  y_data.append(encoded_inputs[i+seq_len])
n_patterns=len(x_data)
print("Total patterns:",n_patterns)

Total patterns: 18545


In [31]:
# Turn the windows into an array of shape (n_patterns, seq_len, 1) and scale
# the character ids by the vocabulary size.
x=numpy.reshape(x_data,(n_patterns,seq_len,1))/float(vocab_len)

In [32]:
# Inspect the reshaped, scaled input array.
x

array([[[0.69230769],
        [0.53846154],
        [0.5       ],
        ...,
        [0.5       ],
        [0.69230769],
        [0.15384615]],

       [[0.53846154],
        [0.5       ],
        [0.23076923],
        ...,
        [0.69230769],
        [0.15384615],
        [0.65384615]],

       [[0.5       ],
        [0.23076923],
        [0.30769231],
        ...,
        [0.15384615],
        [0.65384615],
        [0.84615385]],

       ...,

       [[0.73076923],
        [0.07692308],
        [0.76923077],
        ...,
        [0.53846154],
        [0.73076923],
        [0.26923077]],

       [[0.07692308],
        [0.76923077],
        [0.65384615],
        ...,
        [0.73076923],
        [0.26923077],
        [0.15384615]],

       [[0.76923077],
        [0.65384615],
        [0.30769231],
        ...,
        [0.26923077],
        [0.15384615],
        [0.65384615]]])

In [33]:
# One-hot encode the next-character labels.
# num_classes is pinned to the vocabulary size: without it the width is inferred
# from the largest label present, which would be too small if the highest
# character id never occurs as a target.
y=np_utils.to_categorical(y_data,num_classes=vocab_len)


In [34]:
# Print the one-hot encoded label matrix.
print(y)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [35]:
# Model: three stacked LSTM layers, each followed by 20% dropout, ending in a
# softmax layer with one unit per column of the one-hot label matrix.
model=Sequential([
  LSTM(256, input_shape=(x.shape[1],x.shape[2]), return_sequences=True),
  Dropout(0.2),
  LSTM(256, return_sequences=True),
  Dropout(0.2),
  LSTM(128),
  Dropout(0.2),
  Dense(y.shape[1],activation='softmax'),
])

In [36]:
# Configure training: categorical cross-entropy loss with the Adam optimizer.
model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
)

In [37]:
# Checkpoint callback: save to `filepath` whenever the monitored training loss
# reaches a new minimum.
filepath="model_weights_saved.hdf5"
checkpoint=ModelCheckpoint(
  filepath,
  monitor='loss',
  mode='min',
  save_best_only=True,
  verbose=1,
)
desired_callbacks=[checkpoint]

In [39]:
# Train on the full data set; the checkpoint callback runs after every epoch.
model.fit(
  x,
  y,
  batch_size=256,
  epochs=3,
  callbacks=desired_callbacks,
)

Epoch 1/3

Epoch 00001: loss improved from inf to 2.95006, saving model to model_weights_saved.hdf5
Epoch 2/3

Epoch 00002: loss improved from 2.95006 to 2.94312, saving model to model_weights_saved.hdf5
Epoch 3/3

Epoch 00003: loss improved from 2.94312 to 2.93983, saving model to model_weights_saved.hdf5


<keras.callbacks.History at 0x7faed2553c10>

In [40]:
# Restore the checkpoint written during training, then compile the model again
# with the same loss and optimizer.
filename="model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
)

In [41]:
# Inverse lookup (id -> character) for decoding the model's predictions.
num_to_char=dict(enumerate(chars))

In [42]:
# Show the id -> character lookup table.
num_to_char

{0: 'a',
 1: 'b',
 2: 'c',
 3: 'd',
 4: 'e',
 5: 'f',
 6: 'g',
 7: 'h',
 8: 'i',
 9: 'j',
 10: 'k',
 11: 'l',
 12: 'm',
 13: 'n',
 14: 'o',
 15: 'p',
 16: 'q',
 17: 'r',
 18: 's',
 19: 't',
 20: 'u',
 21: 'v',
 22: 'w',
 23: 'x',
 24: 'y',
 25: 'z'}

In [43]:
# Pick a random training window to seed text generation.
# randint's upper bound is exclusive, so len(x_data) makes every pattern,
# including the last one, selectable.
start=numpy.random.randint(0,len(x_data))
# Work on a copy: the generation step updates `pattern`, and aliasing the list
# stored in x_data would silently corrupt the training data.
pattern=list(x_data[start])
print('Random seed:')
print("\"",''.join([num_to_char[value] for value in pattern]),"\"")

Random seed:
" roundflatsidearmslashweightbehindparryalmostlazybladestouchedsteelshatteredscreamechoedforestnightlo "


In [44]:
# Generate 1000 characters: predict the next character from the current window,
# emit it, then slide the window forward by one position.
for i in range(1000):
  # Same shaping and scaling as the training inputs.
  X=numpy.reshape(pattern,(1,len(pattern),1))
  X=X/float(vocab_len)
  prediction=model.predict(X,verbose=0)
  # Greedy decoding: take the most probable character id as a plain int.
  index=int(numpy.argmax(prediction))
  result=num_to_char[index]
  sys.stdout.write(result)
  # Build a new window rather than appending in place, so the list that
  # `pattern` was taken from is never mutated.
  pattern=pattern[1:]+[index]
  

eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee

In [46]:
# Last character emitted by the generation loop.
result

'e'

In [47]:
# Final window of character ids left over after the generation loop.
pattern

[4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4]