In [1]:

import string
 
# load doc into memory
def load_doc(filename):
 # open the file as read only
 file = open(filename, 'r')
 # read all text
 text = file.read()
 # close the file
 file.close()
 return text
 
# turn a doc into clean tokens
def clean_doc(doc):
 # replace '--' with a space ' '
 doc = doc.replace('--', ' ')
 # split into tokens by white space
 tokens = doc.split()
 # remove punctuation from each token
 table = str.maketrans('', '', string.punctuation)
 tokens = [w.translate(table) for w in tokens]
 # remove remaining tokens that are not alphabetic
 tokens = [word for word in tokens if word.isalpha()]
 # make lower case
 tokens = [word.lower() for word in tokens]
 return tokens
 
# save tokens to file, one dialog per line
def save_doc(lines, filename):
 data = '\n'.join(lines)
 file = open(filename, 'w')
 file.write(data)
 file.close()
 
# load document
in_filename = 'bam.txt'
doc = load_doc(in_filename)
print(doc[:200])
 
# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))
 
# organize into sequences of tokens
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
 # select sequence of tokens
 seq = tokens[i-length:i]
 # convert into a line
 line = ' '.join(seq)
 # store
 sequences.append(line)
print('Total Sequences: %d' % len(sequences))
 
# save sequences to file
out_filename = 'bam_sequences.txt'
save_doc(sequences, out_filename)

This is not book of philosophy. It is exhortation. I hardly have anything 
to 
say to most who aren’t like me, still less do I care about convincing. 
This is 
account of my reveries. I tried to put, 
['this', 'is', 'not', 'book', 'of', 'philosophy', 'it', 'is', 'exhortation', 'i', 'hardly', 'have', 'anything', 'to', 'say', 'to', 'most', 'who', 'like', 'me', 'still', 'less', 'do', 'i', 'care', 'about', 'convincing', 'this', 'is', 'account', 'of', 'my', 'reveries', 'i', 'tried', 'to', 'put', 'as', 'brief', 'and', 'simple', 'as', 'i', 'could', 'the', 'thought', 'that', 'motivates', 'me', 'and', 'the', 'problem', 'faced', 'by', 'life', 'in', 'ascent', 'and', 'decline', 'i', 'was', 'convinced', 'to', 'write', 'this', 'book', 'by', 'certain', 'frogs', 'who', 'told', 'me', 'it', 'not', 'a', 'shame', 'that', 'hucksters', 'are', 'multiplying', 'lies', 'and', 'jizzing', 'their', 'filthy', 'doctrines', 'into', 'receptive', 'minds', 'everywhere', 'born', 'by', 'the', 'thousands', 'and', 'haunt', 

In [2]:
from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
 
# load doc into memory
def load_doc(filename):
 # open the file as read only
 file = open(filename, 'r')
 # read all text
 text = file.read()
 # close the file
 file.close()
 return text
 
# load
in_filename = 'bam_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
 
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1
 
# separate into input and output
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]
 
# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=128, epochs=100)
 
# save the model to file
model.save('bam_model.h5')
# save the tokenizer
dump(tokenizer, open('bam_tokenizer.pkl', 'wb'))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 50)            369350    
                                                                 
 lstm (LSTM)                 (None, 50, 100)           60400     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 7387)              746087    
                                                                 
Total params: 1,266,337
Trainable params: 1,266,337
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch

In [17]:

from random import randint
from pickle import load
from keras.models import load_model
from keras.utils import pad_sequences
import numpy as np

# load doc into memory
def load_doc(filename):
 # open the file as read only
 file = open(filename, 'r')
 # read all text
 text = file.read()
 # close the file
 file.close()
 return text
 
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
 result = list()
 in_text = seed_text
 # generate a fixed number of words
 for _ in range(n_words):
    # encode the text as integer
    encoded = tokenizer.texts_to_sequences([in_text])[0]
    # truncate sequences to a fixed length
    encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
    # predict probabilities for each word
    predictions = model.predict(encoded, verbose=0)
    predictions = predictions.astype('float64')
    predictions /= np.sum(predictions)
    # choose a random character based on probabilities
    probas = np.random.multinomial(1, predictions[0], 1)
    yhat = np.argmax(probas)
    # map predicted word index to word
    out_word = ''
    for word, index in tokenizer.word_index.items():
        if index == yhat:
            out_word = word
            break
    # append to input
    in_text += ' ' + out_word
    result.append(out_word)
 return ' '.join(result)
 
# load cleaned text sequences
in_filename = 'bam_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
seq_length = len(lines[0].split()) - 1
 
# load the model
model = load_model('bam_model.h5')
 
# load the tokenizer
tokenizer = load(open('bam_tokenizer.pkl', 'rb'))
 
# select a seed text
seed_text = "sometime i think" #lines[randint(0,len(lines))]
print(seed_text + '\n')
 
# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)

sometime i think

of for this condition he appears magnified anointed and no two god admired and beyond these look this before a hundred alvarados will bloom and gates rings and returned to a life of god troll be attracts him the leading minds of his very clumsy attempts to affect its cruelty
