In [54]:
#import dependencies
import numpy as np
import sys

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras import layers
from keras.utils import np_utils
from tensorflow.keras.callbacks import ModelCheckpoint
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#loading the data
#Project gutenberg/burg is where the data can be found
file=open('frankenstein-2.txt').read()

In [3]:
#tokenization-is essentially breaking down a stram of text into words,phrases or symbols or other such meaningfull elements called tokens.
#standardization
#tokenization is the process of breaking a stram of text up into words phrases symbols or other meaningful elements
def tokenize_words(input):
    #lowercase everything to standarize it
    input=input.lower()
    #initializing the tokenizer
    tokenizer=RegexpTokenizer(r'\w+')
    #tokenizing the text into tokens
    
    tokens=tokenizer.tokenize(input)
    #filtering the stopwords using lambda
    filtered=filter(lambda token:token not in stopwords.words('english'),tokens)
    return " ".join(filtered)
processed_inputs=tokenize_words(file)

In [4]:
#chars to numbers
chars=sorted(list(set(processed_inputs)))
char_to_num=dict((c,i) for i,c in enumerate(chars))

In [5]:
#check if words to chars to num(?!) has worked?
input_len=len(processed_inputs)
vocab_len=len(chars)
print("total number of characters",input_len)
print("total vocab:",vocab_len)


total number of characters 269995
total vocab: 43


In [37]:
#seq length
seq_length=100
x_data=[]
y_data=[]


In [38]:
#loop through the sequence
for i in range(0,input_len - seq_length,1):
    in_seq=processed_inputs[i:i+seq_length]
    out_seq=processed_inputs[i+seq_length]
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

n_patterns=len(x_data)
print("total patterns",n_patterns)

total patterns 269895


In [39]:
X=np.reshape(x_data,(n_patterns,seq_length,1))
X=X/float(vocab_len)
print(X.shape())

269895


In [40]:
#one hot encoding
y=np_utils.to_categorical(y_data)
print(len(y))

269895


In [55]:
#creating the model
model=Sequential()
model.add(LSTM(256,input_shape=(X.shape[1],X.shape[2]),return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1],activation='softmax'))


In [56]:
#compile the model
model.compile(loss='categorical_crossentropy',optimizer='adam')

In [57]:
#saving weights
filepath="mode_weighs_saved.hdf5"
checkpoint=ModelCheckpoint(filepath,monitor='loss',verbose=1,save_best_only=True,mode='min')
desired_callbacks=[checkpoint]

In [58]:
#fit model and let it train
model.fit(X,y,epochs=4,batch_size=256,callbacks=desired_callbacks)

Epoch 1/4
Epoch 00001: loss improved from inf to 2.92002, saving model to mode_weighs_saved.hdf5
Epoch 2/4
Epoch 00002: loss improved from 2.92002 to 2.63759, saving model to mode_weighs_saved.hdf5
Epoch 3/4
Epoch 00003: loss improved from 2.63759 to 2.48719, saving model to mode_weighs_saved.hdf5
Epoch 4/4
Epoch 00004: loss improved from 2.48719 to 2.36350, saving model to mode_weighs_saved.hdf5


<tensorflow.python.keras.callbacks.History at 0x7f7117bb37b8>

In [59]:
#recompile the model with same weights
filename='mode_weighs_saved.hdf5'
model.load_weights(filename)
model.compile(loss='categorical_crossentropy',optimizer='adam')


In [60]:
##output of the model back into characters
num_to_char=dict((i,c) for i,c in enumerate(chars))

In [61]:
#random seed to help generate
start=np.random.randint(0,len(x_data)-1)
pattern=x_data[start]
print('random seed: ')
print('\'',''.join([num_to_char[value] for value in pattern]),'\'')


random seed: 
' y increase miseries tenfold obstacle wishes ah victor assured cousin playmate sincere love made mise '


In [62]:
#generate the text
for i in range(1000):
    x=np.reshape(pattern,(1,len(pattern),1))
    x=x/float(vocab_len)
    prediction=model.predict(x,verbose=0)
    index=np.argmax(prediction)
    result=num_to_char[index]
    seq_in=[num_to_char[value] for value in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern=pattern[1:len(pattern)]
    

r seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare seare se