In [8]:
#dependencies
import numpy
import sys

from nltk.tokenize import RegexpTokenizer  #ntlk---> natural language toolkit.
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [9]:
#load data
file=open("Frankestein2.txt",encoding="utf8").read()  #encoding is used to avoid unicode decode error.

In [10]:
#tokenization is splitting stream of text into tokens such as words,phrases,..etc.
#it is actually simplifying data
#first we can standardize by making them lowercase.

#Tokenize
#Standardization
def tokenize_words(input):
    
    #lowercase everything to standardize it
    input=input.lower()
    
    #initialize tokenizer
    tokenizer= RegexpTokenizer(r'\w+') #to tokenize words.There are several others options like [A-Z]\w ,S..etc.
    
    #tokenize the text into tokens
    tokens=tokenizer.tokenize(input)
    
    #filter the stopwords using lambda function
    filtered=filter(lambda token:token not in stopwords.words('english'),tokens)
    
    
    return "".join(filtered)

#preprocess the input data ,to tokenize it.
processed_inputs=tokenize_words(file)
    

In [11]:
#neural network actually work with number
#we have to convert characters into numbers


#char to number
#we will sort the set of all characters that appear in out i/p text and then use enumerate to get numbers that
#represent the characters
#we will create dic containing charecters as keys and number that representing it as value.

chars=sorted(list(set(processed_inputs)))
char_to_num=dict((c,i) for i,c in enumerate(chars))

In [12]:
#check if words to char or char to num working?
input_len=len(processed_inputs)
vocab_len=len(chars)
print("Total number of characters:",input_len)
print("Total vocab: ",vocab_len)


Total number of characters: 71391
Total vocab:  40


In [13]:
print(chars)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'æ', 'ê', 'ô']


In [14]:
#seq length
#An individual sequence is complete mapping of input characters as integers.
seq_length=100   #we need how long we need individual sequences.
x_data=[]
y_data=[]


In [15]:
# loop through the sequence
#here we are going through the entire list input and converting chars to numbers with a for loop.
#this will create a bunch of sequences where each sequence start with the next character in the i/p data.
#begenning with the first character.

for i in range(0,input_len-seq_length,1):
    
    #defining input and output sequences
    
    # input is the current character plus desired sequence length.
    
    in_seq=processed_inputs[i:i+seq_length]
    
    # out is the initial  character plus total sequence length
    
    out_seq=processed_inputs[i+seq_length]
    
    #converting list of characters into integers based on previous values and appending it into the lists
    
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append([char_to_num[out_seq]])

#check to see how many input sequences we have to deal with.

n_pattern=len(x_data)
print("Total patterns : ",n_pattern)

Total patterns :  71291


In [16]:
#convert input sequence into np array that our network can use
X=numpy.reshape(x_data,(n_pattern, seq_length,1))
X=X/float(vocab_len)

In [17]:
#one- hot encoding
y=np_utils.to_categorical(y_data)


# creating the sequential model

In [18]:
#dropout is used to avoid overfitting
model=Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1],activation='softmax'))


In [19]:
#compile the model
model.compile(loss='categorical_crossentropy',optimizer='adam')

In [20]:
#saving wieghts  bcz we want some time to train the model, so we will call this later.
filepath='model_weights_saved.hdf5'
checkpoint=ModelCheckpoint(filepath,monitor='loss',verbose=1,save_best_only=True,mode='min')
desired_callbacks=[checkpoint]

In [21]:
#fit model and let it train
model.fit(X,y,epochs=4,batch_size=254,callbacks=desired_callbacks)


Epoch 1/4

Epoch 00001: loss improved from inf to 2.95631, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.95631 to 2.91716, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.91716 to 2.91178, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.91178 to 2.90925, saving model to model_weights_saved.hdf5


<keras.callbacks.callbacks.History at 0x214fb44e088>

In [22]:
#recompile the model with saved weights
filename='model_weights_saved.hdf5'
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [23]:
#output of the model back into characters
num_to_char=dict((i,c) for i,c in enumerate(chars))

In [40]:
#random seed to help generate
start=numpy.random.randint(0,len(x_data))
pattern=x_data[start]
print("Random seed: ")
print("\"",''.join([num_to_char[value] for value in pattern]), "\"")


Random seed: 
" eadharmperformedkindestactioncouldtowardsrealityillsurelynothingunboundedunremittingattentionsfriend "


In [39]:
#generate the text
for i in range(1000):
    x=numpy.reshape(pattern,(1,len(pattern),1))
    x=x/float(vocab_len)
    prediction=model.predict(x,verbose=0)
    index=numpy.argmax(prediction)
    result=num_to_char[index]
    seq_in=[num_to_char[value] for value in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern=pattern[1:len(pattern)]

eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee