In [1]:
import sys, argparse, io, re, numpy, keras
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils as ku

In [2]:
def parse_args(argv=None):
    """Parse the command-line options of the training script.

    Args:
        argv: optional list of argument strings to parse. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.

    Returns:
        argparse.Namespace: carries an integer ``epochs`` attribute
        (default 25).
    """
    parser = argparse.ArgumentParser(description="Gets command line inputs")
    parser.add_argument("--epochs", type=int, default=25, help="number of training epochs")
    # parse_known_args tolerates options this script does not define -- e.g. the
    # ``-f <connection file>`` argument a Jupyter kernel is launched with --
    # instead of aborting with a usage error.
    args, _unknown = parser.parse_known_args(argv)
    return args

In [3]:
def _normalize_text(raw_text):
    """Lower-case ``raw_text``, flatten it to one line and space out punctuation."""
    text = raw_text.lower()

    # Newlines become spaces so the corpus is a single line; carriage returns are dropped.
    text = text.replace('\n', ' ').replace('\r', '')
    # Remove digits, '=' and '_'.
    text = re.sub(r'[\d=_]', r'', text)
    # Insert a space before / after each of " , ; : ( ) when it touches a non-space character.
    text = re.sub(r'(?=[",;:()])(?<=[^\s])', r' ', text)
    text = re.sub(r'(?<=[",;:()])(?=[^\s])', r' ', text)
    # Same treatment for the double dash.
    text = re.sub(r'(?<=[^\s])(--)', r' --', text)
    text = re.sub(r'(--)(?=[^\s])', r'-- ', text)
    return text


def get_data():
    """Download the Nietzsche corpus and return it as one cleaned string.

    The file is obtained through ``keras.utils.get_file`` and read as UTF-8.
    The text is then lower-cased, flattened onto a single line, stripped of
    digits, ``=`` and ``_``, and the marks ``" , ; : ( )`` and ``--`` are
    separated from adjacent non-space characters by a space.

    Returns:
        str: the cleaned corpus text.
    """
    path = keras.utils.get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')

    with io.open(path, encoding='utf-8') as file:
        raw_text = file.read()

    return _normalize_text(raw_text)

# Load the cleaned corpus and preview its first 10,000 characters.
data = get_data()
print(data[:10000])

Downloading data from https://s3.amazonaws.com/text-datasets/nietzsche.txt
preface   supposing that truth is a woman -- what then? is there not ground for suspecting that all philosophers , in so far as they have been dogmatists , have failed to understand women -- that the terrible seriousness and clumsy importunity with which they have usually paid their addresses to truth , have been unskilled and unseemly methods for winning a woman? certainly she has never allowed herself to be won ; and at present every kind of dogma stands with sad and discouraged mien -- if , indeed , it stands at all! for there are scoffers who maintain that it has fallen , that all dogma lies on the ground -- nay more , that it is at its last gasp. but to speak seriously , there are good grounds for hoping that all dogmatizing in philosophy , whatever solemn , whatever conclusive and decided airs it has assumed , may have been only a noble puerilism and tyronism ; and probably the time is at hand when it will

In [4]:
# Split the cleaned corpus into sentences at '?', '!' and '.'.
sentences = re.split('[?!.]', data)
print("Sentences: ")
# Slice rather than index so a corpus with fewer than 20 sentences
# does not raise IndexError.
for sentence in sentences[:20]:
    print(sentence)

# Build the word -> integer index vocabulary from the sentences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
# NOTE(review): the + 1 presumably reserves index 0, which the Keras Tokenizer
# does not assign to any word -- confirm against the Tokenizer documentation.
number_unique_words = len(tokenizer.word_index) + 1

print("Number of unique words: ", number_unique_words)

Sentences: 
preface   supposing that truth is a woman -- what then
 is there not ground for suspecting that all philosophers , in so far as they have been dogmatists , have failed to understand women -- that the terrible seriousness and clumsy importunity with which they have usually paid their addresses to truth , have been unskilled and unseemly methods for winning a woman
 certainly she has never allowed herself to be won ; and at present every kind of dogma stands with sad and discouraged mien -- if , indeed , it stands at all
 for there are scoffers who maintain that it has fallen , that all dogma lies on the ground -- nay more , that it is at its last gasp
 but to speak seriously , there are good grounds for hoping that all dogmatizing in philosophy , whatever solemn , whatever conclusive and decided airs it has assumed , may have been only a noble puerilism and tyronism ; and probably the time is at hand when it will be once and again understood what has actually sufficed for th

In [5]:
# Expand every sentence into its growing token prefixes: a sentence encoded
# as [a, b, c] contributes [a, b] and [a, b, c].
input_sequences = [
    encoded[:end]
    for encoded in (tokenizer.texts_to_sequences([text])[0] for text in sentences)
    for end in range(2, len(encoded) + 1)
]

print(input_sequences[:200])

[[3526, 573], [3526, 573, 8], [3526, 573, 8, 116], [3526, 573, 8, 116, 6], [3526, 573, 8, 116, 6, 7], [3526, 573, 8, 116, 6, 7, 145], [3526, 573, 8, 116, 6, 7, 145, 40], [3526, 573, 8, 116, 6, 7, 145, 40, 141], [6, 39], [6, 39, 14], [6, 39, 14, 1014], [6, 39, 14, 1014, 11], [6, 39, 14, 1014, 11, 5138], [6, 39, 14, 1014, 11, 5138, 8], [6, 39, 14, 1014, 11, 5138, 8, 17], [6, 39, 14, 1014, 11, 5138, 8, 17, 162], [6, 39, 14, 1014, 11, 5138, 8, 17, 162, 5], [6, 39, 14, 1014, 11, 5138, 8, 17, 162, 5, 37], [6, 39, 14, 1014, 11, 5138, 8, 17, 162, 5, 37, 117], [6, 39, 14, 1014, 11, 5138, 8, 17, 162, 5, 37, 117, 10], [6, 39, 14, 1014, 11, 5138, 8, 17, 162, 5, 37, 117, 10, 29], [6, 39, 14, 1014, 11, 5138, 8, 17, 162, 5, 37, 117, 10, 29, 26], [6, 39, 14, 1014, 11, 5138, 8, 17, 162, 5, 37, 117, 10, 29, 26, 57], [6, 39, 14, 1014, 11, 5138, 8, 17, 162, 5, 37, 117, 10, 29, 26, 57, 2154], [6, 39, 14, 1014, 11, 5138, 8, 17, 162, 5, 37, 117, 10, 29, 26, 57, 2154, 26], [6, 39, 14, 1014, 11, 5138, 8, 17, 1

In [6]:
# Length of the longest prefix sequence built above.
longest_sentence_length = max(map(len, input_sequences))
print("Longest sentence length: ", longest_sentence_length)

# Bring every sequence to a common length (pad_sequences defaults).
input_sequences = numpy.array(pad_sequences(input_sequences))

# The last column is the word to predict; every column before it is the context.
x_train = input_sequences[:, :-1]
y_train = ku.to_categorical(input_sequences[:, -1], num_classes=number_unique_words)

print(x_train.shape)
print(y_train.shape)


Longest sentence length:  455
(97614, 454)
(97614, 9959)


In [7]:
# create lstm and blstm models

def create_lstm():
    """Build and compile the single-LSTM next-word model.

    Layers, in order: a 25-dimensional Embedding over ``number_unique_words``
    indices, an LSTM with 100 units, and a softmax Dense layer with one
    output per vocabulary index. Reads the module-level
    ``number_unique_words`` and ``longest_sentence_length``.

    Returns:
        The compiled Sequential model (categorical cross-entropy loss,
        'adam' optimizer, accuracy metric).
    """
    model = Sequential()
    # Only the first layer is given an explicit input length.
    model.add(Embedding(number_unique_words, 25, input_length=longest_sentence_length-1))
    model.add(LSTM(100))
    # Softmax: one class per vocabulary index.
    model.add(Dense(number_unique_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def create_blstm():
    """Build and compile the stacked bidirectional-LSTM next-word model.

    Layers, in order: a 25-dimensional Embedding over ``number_unique_words``
    indices, a Bidirectional LSTM (100 units, ``return_sequences=True``), a
    second Bidirectional LSTM (100 units), and a softmax Dense layer with one
    output per vocabulary index. Reads the module-level
    ``number_unique_words`` and ``longest_sentence_length``.

    Returns:
        The compiled Sequential model (categorical cross-entropy loss,
        'adam' optimizer, accuracy metric).
    """
    model = Sequential()
    model.add(Embedding(number_unique_words, 25, input_length=longest_sentence_length-1))
    # The first recurrent layer returns full sequences for the second one to consume.
    model.add(Bidirectional(LSTM(100, return_sequences=True)))
    model.add(Bidirectional(LSTM(100)))
    model.add(Dense(number_unique_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model



# Instantiate both architectures (LSTM first, then the bidirectional variant).
lstm, blstm = create_lstm(), create_blstm()

# print(lstm.summary())
# print(blstm.summary())

In [None]:
# train our models

def _fit_and_plot(model, epochs):
    """Fit ``model`` on ``x_train``/``y_train`` and plot per-epoch accuracy and loss."""
    history = model.fit(x_train, y_train, epochs=epochs, verbose=1)

    # One figure per metric; each gets its own title, axis labels and legend
    # (a single trailing plt.legend() would only reach the last figure).
    for key, label, title, ylabel in (
        ('accuracy', 'Training accuracy', 'Training accuracy', 'Accuracy'),
        ('loss', 'Training Loss', 'Training loss', 'Loss'),
    ):
        values = history.history[key]
        _, ax = plt.subplots()
        ax.plot(range(len(values)), values, 'b', label=label)
        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(ylabel)
        ax.legend()
    plt.show()


def train_lstm(epochs):
    """Train the module-level ``lstm`` model and plot its training curves.

    Args:
        epochs: number of epochs passed to ``fit``.
    """
    _fit_and_plot(lstm, epochs)


def train_blstm(epochs):
    """Train the module-level ``blstm`` model and plot its training curves.

    Args:
        epochs: number of epochs passed to ``fit``.
    """
    _fit_and_plot(blstm, epochs)


# train_lstm(10)
# NOTE(review): with train_lstm commented out, this cell only fits `blstm`;
# `lstm` is still passed to get_model_prediction in main() -- confirm that
# generating text from the unfitted model is intended.
train_blstm(10)

In [None]:
def get_model_prediction(model, seed_text, desired_output_length):
    """Extend ``seed_text`` by repeatedly appending the model's top-scoring next word.

    At each step the text produced so far is tokenized with the module-level
    ``tokenizer``, padded to ``longest_sentence_length - 1`` and fed to
    ``model.predict``; the index with the highest score is mapped back to a
    word and appended.

    Args:
        model: object exposing ``predict(batch, verbose=0)`` that returns one
            row of per-index scores for the single input row.
        seed_text: starting text; returned unchanged as the prefix of the result.
        desired_output_length: number of prediction steps to run.

    Returns:
        str: ``seed_text`` followed by the predicted words, separated by single
        spaces. A step whose predicted index has no entry in
        ``tokenizer.word_index`` appends nothing, so fewer than
        ``desired_output_length`` words may be added.
    """
    # Build the index -> word table once instead of scanning word_index on every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    output_text = seed_text
    for _ in range(desired_output_length):
        token_list = tokenizer.texts_to_sequences([output_text])[0]
        token_list = pad_sequences([token_list], maxlen=longest_sentence_length-1)

        # Sequential.predict_classes is not available in current TensorFlow
        # releases; argmax over the predicted scores yields the same class index.
        scores = model.predict(token_list, verbose=0)
        predicted_index = int(numpy.argmax(scores, axis=-1)[0])

        word = index_to_word.get(predicted_index)
        if word is not None:
            output_text += ' ' + word

    return output_text

In [None]:
def main():
    """Generate 30 words from the seed "We are" with each model and print both results.

    Both generations are computed first, then printed: the LSTM text,
    followed by the bidirectional-LSTM text.
    """
    seed_text = "We are"
    desired_output_length = 30

    generated = [
        ("\nLSTM generated: ", get_model_prediction(lstm, seed_text, desired_output_length)),
        ("\nBLSTM generated: ", get_model_prediction(blstm, seed_text, desired_output_length)),
    ]
    for heading, text in generated:
        print(heading, text)

# Run the text-generation demo for both models.
main()
