In [1]:
def read_file(file_path):
    with open(file_path) as f:
        str_text = f.read()
    
    return str_text

In [2]:
#read_file('moby_dick_four_chapters.txt')

In [3]:
import spacy

In [5]:
nlp = spacy.load('en_core_web_lg', disable=['parser', 'tagger', 'ner'])

In [6]:
nlp.max_length = 1198623

In [7]:
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [8]:
d = read_file('moby_dick_four_chapters.txt')

In [9]:
tokens = separate_punc(d)

In [10]:
len(tokens)

11394

In [11]:
train_length = 25 + 1

text_sequences = []

for i in range(train_length, len(tokens)):
    seq = tokens[i-train_length:i]
    
    text_sequences.append(seq)

In [12]:
' '.join(text_sequences[0])

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [13]:
' '.join(text_sequences[1])

'me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore'

In [14]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [15]:
tokenizer = Tokenizer()

In [16]:
tokenizer.fit_on_texts(text_sequences)

In [17]:
sequence = tokenizer.texts_to_sequences(text_sequences)

In [18]:
#sequence[0]
tokenizer.word_counts

OrderedDict([('call', 27),
             ('me', 2471),
             ('ishmael', 133),
             ('some', 758),
             ('years', 135),
             ('ago', 84),
             ('never', 449),
             ('mind', 164),
             ('how', 321),
             ('long', 374),
             ('precisely', 37),
             ('having', 142),
             ('little', 767),
             ('or', 950),
             ('no', 1029),
             ('money', 120),
             ('in', 5647),
             ('my', 1812),
             ('purse', 71),
             ('and', 9646),
             ('nothing', 281),
             ('particular', 152),
             ('to', 6497),
             ('interest', 24),
             ('on', 1716),
             ('shore', 26),
             ('i', 7176),
             ('thought', 676),
             ('would', 702),
             ('sail', 104),
             ('about', 1014),
             ('a', 10377),
             ('see', 442),
             ('the', 15566),
             ('watery', 26),
  

In [19]:
vocab_size = len(tokenizer.word_counts)

In [20]:
import numpy as np

In [21]:
sequences = np.array(sequence)

In [22]:
sequences

array([[ 964,   14,  265, ..., 2704,   14,   24],
       [  14,  265,   51, ...,   14,   24,  965],
       [ 265,   51,  263, ...,   24,  965,    5],
       ...,
       [ 960,   12,  168, ...,  264,   53,    2],
       [  12,  168, 2703, ...,   53,    2, 2709],
       [ 168, 2703,    3, ...,    2, 2709,   26]])

In [23]:
X = sequences[:, :-1]
y = sequences[:, -1]

In [24]:
X

array([[ 964,   14,  265, ...,    6, 2704,   14],
       [  14,  265,   51, ..., 2704,   14,   24],
       [ 265,   51,  263, ...,   14,   24,  965],
       ...,
       [ 960,   12,  168, ...,   11,  264,   53],
       [  12,  168, 2703, ...,  264,   53,    2],
       [ 168, 2703,    3, ...,   53,    2, 2709]])

In [25]:
y

array([  24,  965,    5, ...,    2, 2709,   26])

In [26]:
from keras.utils import to_categorical

In [27]:
y = to_categorical(y, num_classes=vocab_size + 1)

In [28]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [29]:
seq_len = X.shape[1]

In [30]:
X.shape

(11368, 25)

In [31]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

In [32]:
def create_model(vocab_size, seq_len):
    
    model = Sequential()
    model.add(Embedding(vocab_size, seq_len, input_length=seq_len))
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(50, activation='relu'))
    
    model.add(Dense(vocab_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
    
    model.summary()
    
    return model

In [33]:
model = create_model(vocab_size + 1, seq_len)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 25)            67750     
_________________________________________________________________
lstm_1 (LSTM)                (None, 25, 50)            15200     
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_1 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_2 (Dense)              (None, 2710)              138210    
Total params: 243,910
Trainable params: 243,910
Non-trainable params: 0
_________________________________________________________________


In [34]:
from pickle import dump, load

In [35]:
model.fit(X, y, batch_size=128, epochs=2, verbose=1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x131bb3320>

In [36]:
model.save('myTextGenerator.h5')

In [37]:
dump(tokenizer, open('my_simpletokenizer', 'wb'))

In [38]:
from keras.preprocessing.sequence import pad_sequences

In [57]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    
    output_text = []
    
    input_text = seed_text
    #print('input_text = ' + input_text)
    
    for i in range(num_gen_words):
        
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        pad_encoded = pad_sequences([encoded_text], maxlen = seq_len, truncating = 'pre')
        
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]
        
        pred_word = tokenizer.index_word[pred_word_ind]
        
        #print(pred_word)
        
        input_text += ' ' + pred_word
        
        output_text.append(pred_word)
        
    return ' '.join(output_text)

In [58]:
import random

In [59]:
random.seed(101)
random_pick = random.randint(0, len(text_sequences))

In [60]:
random_seed_text = text_sequences[random_pick]

In [61]:
random_seed_text
seed_text = ' '.join(random_seed_text)
seed_text

'and throwing the clothes to one side he really did this in not only a civil but a really kind and charitable way i stood looking'

In [62]:
generate_text(model, tokenizer, seq_len, seed_text = seed_text, num_gen_words=25)

'the the the the the the the the the the the the the the the the the the the the the the the the the'