In [19]:
import os
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import ModelCheckpoint
import random

import matplotlib.pyplot as plt



In [20]:
#Read all Text of Books
print('Number of books used to create corpus: ', 96)
# Read through a context manager so the file handle is closed as soon as the
# text is in memory instead of staying open for the life of the kernel.
with open('books_combined.txt') as corpus_file:
    books_combined = corpus_file.read()
print('Number of characters in corpus: ', len(books_combined))

# Words are delimited by single spaces only, so tokens that differ just by
# attached punctuation (e.g. "watson" vs "watson.") are counted separately.
unique_words = len(set(books_combined.split(' ')))
print('Number of unique words: ', unique_words)

Number of books used to create corpus:  96
Number of characters in corpus:  74454677
Number of unique words:  343646


In [21]:
# Display a 1,000-character slice of the full corpus as a quick sanity check
# of the loaded text (the bare expression is rendered as the cell output).
books_combined[5000:6000]

'ven and a half pounds since i saw you. seven! i answered. indeed, i should have thought a little more. just a trifle more, i fancy, watson. and in practice again, i observe. you did not tell me that you intended to go into harness. then, how do you know? i see it, i deduce it. how do i know that you have been getting yourself very wet lately, and that you have a most clumsy and careless servant girl? my dear holmes, said i, this is too much. you would certainly have been burned, had you lived a few centuries ago. it is true that i had a country walk on thursday and came home in a dreadful mess, but as i have changed my clothes i cant imagine how you deduce it. as to mary jane, she is incorrigible, and my wife has given her notice, but there, again, i fail to see how you work it out. he chuckled to himself and rubbed his long, nervous hands together. it is simplicity itself, said he; my eyes tell me that on the inside of your left shoe, just where the firelight strikes it, the leather 

In [32]:
# The complete corpus is more than the kernel can handle, so all further work
# uses a fixed 5,000,000-character window taken from the middle of it.
start_book_ind, end_book_ind = 20000000, 25000000
sub_books_combined = books_combined[start_book_ind:end_book_ind]

# Build the character <-> integer lookup tables. The characters are sorted
# first so that every run assigns the same integer to the same character.
vocabulary = sorted(set(sub_books_combined))
int_to_char = dict(enumerate(vocabulary))
char_to_int = {char: index for index, char in int_to_char.items()}

# For this window the count is expected to be 48 on every run.
print('number of unique characters: ', len(char_to_int))
print('The unique characters within the text: ', list(char_to_int))

number of unique characters:  48
The unique characters within the text:  [' ', '!', '&', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [33]:
#returns a tensor for training
def get_train(book, sentence_length=50, step=2, char_index=None):
    """One-hot encode overlapping character windows of ``book`` for training.

    The text is cut into windows of ``sentence_length`` characters whose start
    positions are ``step`` characters apart. Each window is paired with the
    single character that immediately follows it.

    Parameters
    ----------
    book : str
        Text to slice into training windows.
    sentence_length : int, default 50
        Number of characters in every window.
    step : int, default 2
        Distance between the starts of consecutive windows.
    char_index : dict, optional
        Mapping from character to integer class index. When omitted, the
        notebook-level ``char_to_int`` mapping is used.

    Returns
    -------
    x : numpy.ndarray
        ``int8`` array of shape ``(num_windows, sentence_length, vocab_size)``
        with a 1 at ``[i, k, index of k-th character of window i]``.
    y : numpy.ndarray
        ``int8`` array of shape ``(num_windows, vocab_size)`` with a 1 at the
        index of the character following window ``i``.

    Raises
    ------
    ValueError
        If ``step`` is smaller than 1.
    KeyError
        If ``book`` contains a character missing from the mapping.
    """
    if step < 1:
        raise ValueError("step must be a positive integer")
    if char_index is None:
        # Fall back to the vocabulary built at notebook level.
        char_index = char_to_int

    # Window start offsets; the upper bound guarantees that the character
    # right after each window (the training target) exists.
    starts = range(0, len(book) - sentence_length, step)
    sent = [book[begin:begin + sentence_length] for begin in starts]
    next_c = [book[begin + sentence_length] for begin in starts]

    # Show the first few samples as a quick visual check.
    print(sent[:5])
    print(next_c[:5])

    # Size the one-hot axis from the mapping that supplies the indices, not
    # from the characters that happen to occur in `book`; otherwise a text
    # that uses only part of the vocabulary would yield indices outside the
    # tensor (or tensors narrower than the model expects).
    unique_c_length = len(char_index)
    num_sent = len(sent)
    x = np.zeros((num_sent, sentence_length, unique_c_length), dtype=np.int8)
    y = np.zeros((num_sent, unique_c_length), dtype=np.int8)

    for i, sentence in enumerate(sent):
        for k, c in enumerate(sentence):
            x[i, k, char_index[c]] = 1
        y[i, char_index[next_c[i]]] = 1

    return x, y

In [34]:
# Settings reused by the encoder call below, the network's input shape and
# the text generator: the window length and the one-hot width.
sentence_length = 50
num_unique_chars = len(set(sub_books_combined))
for label, value in (
    ("sentence_length: ", sentence_length),
    ("num unique_chars: ", num_unique_chars),
):
    print(label, value)

# Encode the training subset and report the resulting tensor shapes.
x, y = get_train(sub_books_combined, sentence_length=sentence_length)
print(x.shape, y.shape, sep="\n")

sentence_length:  50
num unique_chars:  48
['force down the general throat like a bolus, always', 'rce down the general throat like a bolus, always t', 'e down the general throat like a bolus, always to ', 'down the general throat like a bolus, always to be', 'wn the general throat like a bolus, always to be h']
[' ', 'o', 'b', ' ', 'e']
(2499975, 50, 48)
(2499975, 48)


In [35]:
# Display a 1,000-character slice of the training subset to see what kind of
# text the model is trained on (rendered as the cell output).
sub_books_combined[5000:6000]

'e same principles, like so many pianoforte legs. he had been put through an immense variety of paces, and had answered volumes of head-breaking questions. orthography, etymology, syntax, and prosody, biography, astronomy, geography, and general cosmography, the sciences of compound proportion, algebra, land-surveying and levelling, vocal music, and drawing from models, were all at the ends of his ten chilled fingers. he had worked his stony way into her majestys most honourable privy councils schedule b, and had taken the bloom off the higher branches of mathematics and physical science, french, german, latin, and greek. he knew all about all the water sheds of all the world (whatever they are), and all the histories of all the peoples, and all the names of all the rivers and mountains, and all the productions, manners, and customs of all the countries, and all their boundaries and bearings on the two and thirty points of the compass. ah, rather overdone, mchoakumchild. if he had only

In [None]:
# Network definition: one LSTM layer over the one-hot character window,
# dropout, then a softmax over the vocabulary for the next character.
# 128 units are used because the current network is small.
model = Sequential([
    LSTM(128, input_shape=(sentence_length, num_unique_chars)),
    Dropout(0.2),
    Dense(num_unique_chars, activation='softmax'),
])

# Continue from the weights saved by a previous training round.
weights_filename = 'models/round4/model_4.h5'
model.load_weights(weights_filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Write a checkpoint only when the training loss reaches a new minimum; the
# epoch number and loss value are embedded in the file name.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}-bigger.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

model.fit(x, y, epochs=50, batch_size=256, callbacks=[checkpoint])

In [None]:
#creates new sentence
def create_passage(model, length=500, temperature=0.5):
    """Generate text by repeatedly sampling the model's next-character output.

    A random excerpt of ``sentence_length`` characters from
    ``sub_books_combined`` seeds the generation. At every step the current
    window is one-hot encoded, the model's predicted distribution is sharpened
    or flattened by ``temperature``, one character is sampled from it, and the
    window slides forward by that character.

    Relies on the notebook-level names ``sub_books_combined``,
    ``sentence_length``, ``char_to_int`` and ``int_to_char``.

    Parameters
    ----------
    model
        Object whose ``predict(x, verbose=0)`` returns, for a batch of one-hot
        windows, one row of per-character probabilities per window.
    length : int, default 500
        Number of characters to generate after the seed.
    temperature : float, default 0.5
        Sampling temperature; values below 1 favour the most likely
        characters, values above 1 make the choice more random.

    Returns
    -------
    str
        The seed excerpt followed by the generated characters.

    Raises
    ------
    ValueError
        If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    vocab_size = len(char_to_int)

    #pick a random seed excerpt from the training text
    start_ind = random.randint(0, len(sub_books_combined) - sentence_length - 1)
    sentence = sub_books_combined[start_ind: start_ind+sentence_length]
    print('original sentenc: ', sentence)

    #collect the pieces and join once at the end instead of growing a string
    pieces = [sentence]

    for _ in range(length):
        #turn sentence into model format
        x_pred = np.zeros((1, sentence_length, vocab_size))
        for k, c in enumerate(sentence):
            x_pred[0, k, char_to_int[c]] = 1

        #predict next character - returns predicted probabilities
        #float64 is required: multinomial rejects float32 sums that drift above 1
        prob_c = np.asarray(model.predict(x_pred, verbose=0)[0]).astype('float64')

        #temperature scaling in log space; a zero probability maps to -inf
        #(and back to exactly 0) without emitting a divide-by-zero warning
        with np.errstate(divide='ignore'):
            log_prob = np.log(prob_c) / temperature
        #shift by the maximum so the largest term is exp(0) and cannot underflow
        exp_prob = np.exp(log_prob - np.max(log_prob))
        pred_prob = exp_prob / np.sum(exp_prob)

        #draw a single one-hot sample and read off the chosen index
        prob_ind = np.argmax(np.random.multinomial(1, pred_prob, 1))

        #turn int to character and slide the window forward by it
        next_c = int_to_char[prob_ind]
        pieces.append(next_c)
        sentence = sentence[1:] + next_c

    return ''.join(pieces)

In [None]:
# Try the generator: create_passage seeds itself with a random excerpt of the
# training subset, and the seed plus the generated continuation is printed.
print(create_passage(model))

original sentenc: sly stirred us to glory and gave me these<br>
sly stirred us to glory and gave me these of the sense of the strong and being in a state and the street the soul for the spoken to scarne the fire destroyed and with the subjects best notice and absolute stream by the death of a contrary of the concerning the supportions is been ever known to the interest of the carriage in the man project gutenberg-tm electronic works in a most common-wealth and one of one final and the more not one of the wants of the consequence where the present stood the other were the small and project gutenberg

In [None]:
# Persist the trained network in two parts.
# 1) Architecture, serialized to JSON.
model_json = model.to_json()
with open("model_4_architecture.json", "w+") as architecture_file:
    architecture_file.write(model_json)

# 2) Weights, stored in HDF5 format.
model.save_weights("model_4.h5")

print("Saved model to disk")

Saved model to disk

In [None]:
# Fetch the metrics dictionary from the history object attached to the model;
# its 'loss' entry is what the plotting cell reads.
# NOTE(review): presumably only populated after model.fit() has run in this
# kernel session -- confirm before running this cell on a fresh kernel.
history_dict = model.history.history

In [None]:
#plot the loss function
# Take the x-axis from the number of recorded loss values rather than a
# hard-coded epoch count: plt.plot raises when x and y differ in length, so a
# fixed range only works for one specific number of completed epochs.
losses = history_dict['loss']
plt.plot(range(len(losses)), losses)
plt.title('Training Data Loss Function')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.show()